In [35]:
# NOTE(review): Image is not referenced in the cells visible here.
from IPython.display import Image
import pandas as pd
import numpy as np
import warnings
# Suppresses every warning for the rest of the session, so library
# deprecation or data-quality warnings will not be shown.
warnings.simplefilter("ignore")
# NOTE(review): normalize is not referenced in the cells visible here.
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

import seaborn as sns
# Default figure size applied through seaborn's rc settings.
sns.set(rc={'figure.figsize':(6,6)})

**The above code imports necessary libraries for working with data, handling warnings, and performing preprocessing tasks. The seaborn library is also imported to set the figure size for visualizations.**

In [36]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="compas-scores-two-years.csv")

In [37]:
# Columns excluded from the analysis; every name must exist in df,
# otherwise drop() raises a KeyError.
drop_columns = [
    'compas_screening_date', 'juv_fel_count', 'juv_misd_count',
    'c_case_number', 'vr_charge_degree', 'vr_offense_date',
    'vr_charge_desc', 'c_arrest_date', 'r_case_number',
    'vr_case_number', 'start', 'juv_other_count',
    'days_b_screening_arrest', 'c_days_from_compas',
    'first', 'last', 'name', 'dob',
    'c_jail_in', 'c_jail_out', 'c_offense_date',
    'c_charge_degree', 'c_charge_desc',
    'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
    'r_charge_desc', 'r_jail_in', 'r_jail_out',
    'violent_recid', 'score_text', 'screening_date',
    'v_score_text', 'v_screening_date',
    'in_custody', 'out_custody', 'id', 'end',
    'type_of_assessment', 'v_type_of_assessment',
    'is_recid', 'is_violent_recid', 'event',
    'decile_score.1', 'priors_count.1',
]
df = df.drop(columns=drop_columns)

**The above code drops a list of unnecessary columns from the DataFrame (df). These columns are considered irrelevant and redundant for the analysis.**

In [38]:
# Preview the first five rows of the trimmed frame.
df.head(n=5)

Unnamed: 0,sex,age,age_cat,race,decile_score,priors_count,v_decile_score,two_year_recid
0,Male,69,Greater than 45,Other,1,0,1,0
1,Male,34,25 - 45,African-American,3,0,1,1
2,Male,24,Less than 25,African-American,4,4,3,1
3,Male,23,Less than 25,African-American,8,1,6,0
4,Male,43,25 - 45,Other,1,2,1,0


In [39]:
# Fold the three listed categories into the 'Other' label.
df["race"] = df["race"].replace(
    to_replace=["Asian", "Hispanic", "Native American"],
    value="Other",
)

**The above code consolidates the smaller racial categories (Asian, Hispanic and Native American) into 'Other', so that the analysis can focus on comparing the African-American and Caucasian groups.**

In [40]:
# Split column labels by dtype: numeric columns vs. everything else.
numerical_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(exclude=np.number).columns

**The above code identifies categorical and numerical columns in the DataFrame.**

In [41]:
# Median-impute the numeric columns. fit_transform returns a plain array,
# so the column labels are re-attached when the frame is rebuilt.
numerical = pd.DataFrame(
    data=SimpleImputer(strategy="median").fit_transform(df.loc[:, numerical_cols]),
    columns=numerical_cols,
)
numerical.head(n=5)

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid
0,69.0,1.0,0.0,1.0,0.0
1,34.0,3.0,0.0,1.0,1.0
2,24.0,4.0,4.0,3.0,1.0
3,23.0,8.0,1.0,6.0,0.0
4,43.0,1.0,2.0,1.0,0.0


**The above code imputes missing values in numerical columns using the median strategy and creates a new DataFrame (numerical) with the imputed values.**

In [42]:
# One-hot encode every non-numeric column (one indicator column per category).
categorical = pd.get_dummies(data=df.loc[:, categorical_cols])
categorical.head(n=5)

Unnamed: 0,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,0,1,0,1,0,0,0,1
1,0,1,1,0,0,1,0,0
2,0,1,0,0,1,1,0,0
3,0,1,0,0,1,1,0,0
4,0,1,1,0,0,0,0,1


**The above code encodes categorical variables using one-hot encoding and creates a new DataFrame (categorical).**

In [43]:
# Combine the imputed numeric columns with the one-hot columns, aligning on the row index.
df_new = numerical.merge(
    categorical,
    how="inner",
    left_index=True,
    right_index=True,
)
df_new.head(n=5)

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,1.0,0.0,0,1,0,1,0,0,0,1
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,0,0,1,1,0,0
3,23.0,8.0,1.0,6.0,0.0,0,1,0,0,1,1,0,0
4,43.0,1.0,2.0,1.0,0.0,0,1,1,0,0,0,0,1


**The above code merges the numerical and one-hot encoded categorical DataFrames to create a final preprocessed DataFrame (df_new).**

In [44]:
# Persist the preprocessed frame; with the default settings the row index is
# written as the first CSV column.
df_new.to_csv(path_or_buf='useful-two-year.csv')

**The resulting 'useful-two-year.csv' file contains the preprocessed data that is used for model training.**

## **Decision Tree Model**

**We remove the race column at first and add it back later when our model fits and predicts the data since we are looking for bias in that column.**

# Feature Selection and Data Splitting:

In [45]:
# Set the one-hot race indicators aside so they can be re-attached after
# prediction, then remove them from the modelling frame.
drops = ['race_African-American','race_Caucasian','race_Other']
df_addback = df_new[drops]
df_new = df_new.drop(columns=drops)

# Every remaining column except the label is used as a feature.
target_variable = 'two_year_recid'
y = df_new[target_variable]
X = df_new.drop(columns=target_variable)
independent_variables = X.columns

**In the above code, df_addback is created to store the race columns that are temporarily removed during model training. The race columns are then dropped from the DataFrame (df_new) so that they are excluded from the decision tree model's features.**

**After that the code defines the target variable (y) and independent variables (X) for the decision tree model.**

In [46]:
# Preview the modelling frame now that the race indicators are removed.
df_new.head(n=5)

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25
0,69.0,1.0,0.0,1.0,0.0,0,1,0,1,0
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,0,0,1
3,23.0,8.0,1.0,6.0,0.0,0,1,0,0,1
4,43.0,1.0,2.0,1.0,0.0,0,1,1,0,0


# Decision Tree Model Training and Cross-Validation:

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Baseline tree with default hyperparameters (no depth limit).
# random_state is fixed because scikit-learn randomly permutes the features at
# each split; without a seed the fitted tree, its cross-validation scores and
# its feature importances can differ from one run to the next.
tree = DecisionTreeClassifier(random_state=0)

In [48]:
# Mean ROC AUC of the baseline tree across 5 cross-validation folds.
np.mean(cross_val_score(estimator=tree, X=X, y=y, scoring="roc_auc", cv=5))

0.5951055342894521

**The above code initializes a Decision Tree Classifier and performs 5-fold cross-validation, measuring the area under the ROC curve (ROC AUC) as the evaluation metric.**

# Feature Importance Analysis:

In [49]:
# fit() returns the estimator itself, so training and in-sample prediction
# can be chained; only the first ten predictions are shown.
tree.fit(X, y).predict(X)[:10]

array([0., 1., 1., 0., 0., 0., 1., 0., 0., 0.])

In [50]:
# Map each feature name to its importance in the fitted baseline tree.
{
    feature: importance
    for feature, importance in zip(independent_variables, tree.feature_importances_)
}

{'age': 0.3083914933727195,
 'decile_score': 0.26120304983950826,
 'priors_count': 0.19306066431994978,
 'v_decile_score': 0.1770223667947748,
 'sex_Female': 0.02206901890739292,
 'sex_Male': 0.015193748368656177,
 'age_cat_25 - 45': 0.0061518299278011555,
 'age_cat_Greater than 45': 0.005943047027472511,
 'age_cat_Less than 25': 0.010964781441725054}

**The above code trains the decision tree on the entire dataset (X, y) and then prints the first 10 predictions. It also outputs the feature importance scores for each independent variable.**

# Hyperparameter Tuning:

In [51]:
# Compare max_depth values 2..9 by mean 3-fold cross-validated ROC AUC.
depths = np.arange(2,10)
results = []
for depth in depths:
    # A fixed random_state keeps the depth ranking reproducible between runs.
    best_depth_tree = DecisionTreeClassifier(max_depth=depth, random_state=0)
    # The metric is ROC AUC (not accuracy), averaged over the folds.
    fold_scores = cross_val_score(best_depth_tree, X, y, scoring="roc_auc", cv=3)
    results.append(fold_scores.mean())

In [52]:
# Tabulate the depth search and list the best-scoring depths first.
test = pd.DataFrame(data=dict(depths=depths, mean_roc_auc=results))
test.sort_values(by="mean_roc_auc", ascending=False)

Unnamed: 0,depths,mean_roc_auc
3,5,0.723594
4,6,0.721002
2,4,0.718966
5,7,0.715797
1,3,0.710308
6,8,0.701194
0,2,0.687591
7,9,0.685449


**The above code tests tree depths from 2 to 9 and ranks them by their mean 3-fold cross-validated ROC AUC scores; a depth of 5 scores highest.**

# Visualizing the Decision Tree:

In [60]:
# max_depth=5 had the highest mean cross-validated ROC AUC in the depth search.
# random_state is fixed so the fitted tree, and the error analysis built on its
# predictions, is reproducible between runs.
simple_tree = DecisionTreeClassifier(max_depth=5, random_state=0)
simple_tree.fit(X, y)

In [54]:
# simple_tree is already fitted in the cell that constructs it, so it is not
# refit here. These are in-sample predictions (made on the training rows).
predictions = simple_tree.predict(X)

In [55]:
import graphviz
from sklearn.tree import export_graphviz

def draw_tree(tree):
    """Render a fitted decision tree to ``tree.png`` and open it in a viewer.

    Parameters
    ----------
    tree : fitted scikit-learn decision tree
        Estimator to export. Feature labels come from the notebook-level
        ``independent_variables``.
    """
    dot_data = export_graphviz(
        tree,
        out_file=None,
        feature_names=list(independent_variables),
        # class_names must be listed in ascending class order:
        # 0.0 (did not recidivate) first, then 1.0 (did recidivate).
        class_names=['did not recidivate', 'did recidivate'],
        filled=True,
        rounded=True,
        special_characters=True,
        proportion=True,
    )

    graph = graphviz.Source(dot_data)
    graph.format = 'png'
    # view=True asks graphviz to open the rendered file once it is written.
    graph.render('tree', view=True)

draw_tree(simple_tree)

# Joining Predictions and Race Information Back to the Data:

In [56]:
# Attach the model's predictions, then place the race indicators that were set
# aside back alongside them, column-wise.
df_new["predictions"] = predictions
df_rejoin = pd.concat(objs=[df_new, df_addback], axis="columns")

**The above code adds the predictions from the decision tree model back to the DataFrame (df_new). It also concatenates the original race information (df_addback) to the DataFrame to analyze the predictions with respect to race.**

In [57]:
# Preview the frame with predictions and race indicators attached.
df_rejoin.head(n=5)

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,predictions,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,1.0,0.0,0,1,0,1,0,0.0,0,0,1
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0,0.0,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,0,0,1,1.0,1,0,0
3,23.0,8.0,1.0,6.0,0.0,0,1,0,0,1,0.0,1,0,0
4,43.0,1.0,2.0,1.0,0.0,0,1,1,0,0,0.0,0,0,1


**The above code displays the head of the DataFrame (df_rejoin) containing predictions and race information as shown.**

## **Interpretation-**

**Here, the decision tree model was trained without considering the "race" column, which was an attempt to mitigate potential bias associated with this attribute.**

**After making predictions, the "race" column will be added back for the analysis of false positives and false negatives to understand how the model performs across different racial groups as shown below.**

In [61]:
# Rows predicted as 1.0 (recidivate) whose true label is 0.0.
false_positives = df_rejoin.loc[
    df_rejoin["two_year_recid"].eq(0.0) & df_rejoin["predictions"].eq(1.0)
]
false_positives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,predictions,race_African-American,race_Caucasian,race_Other
17,25.0,10.0,3.0,9.0,0.0,0,1,1,0,0,1.0,1,0,0
35,26.0,8.0,6.0,8.0,0.0,0,1,1,0,0,1.0,0,1,0
40,21.0,8.0,2.0,8.0,0.0,1,0,0,0,1,1.0,1,0,0
54,31.0,5.0,15.0,7.0,0.0,0,1,1,0,0,1.0,1,0,0
58,53.0,5.0,8.0,2.0,0.0,0,1,0,1,0,1.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7158,21.0,6.0,1.0,5.0,0.0,0,1,0,0,1,1.0,1,0,0
7167,30.0,9.0,2.0,7.0,0.0,0,1,1,0,0,1.0,0,1,0
7169,22.0,5.0,0.0,7.0,0.0,0,1,0,0,1,1.0,1,0,0
7198,32.0,5.0,4.0,3.0,0.0,0,1,1,0,0,1.0,1,0,0


**The above code selects rows where the true label is "did not recidivate" (two_year_recid == 0.0) but the model predicted "did recidivate" (predictions == 1.0).**

In [62]:
# NOTE: each "rate" divides the group's false positives by the group's total
# row count, not by the number of actual non-recidivists in that group.
print('African American false positive count: ' + str(false_positives['race_African-American'].sum()))
print('African American false positive rate is : ' + str(false_positives['race_African-American'].sum() / df_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false positive count: ' + str(false_positives['race_Caucasian'].sum()))
# Label corrected: this line reports the Caucasian group, not African American.
print('Caucasian false positive rate is : ' + str(false_positives['race_Caucasian'].sum() / df_rejoin['race_Caucasian'].sum()))


African American false positive count: 538
African American false positive rate is : 0.14556277056277056


Caucasian false positive count: 210
African American false positive rate is : 0.08557457212713937


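**The above code calculates and prints the false positive count and false positive rate for both African American and Caucasian groups. Note that the rate is computed here as the group's false positives divided by the total number of individuals in that group.**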

In [63]:
# Rows predicted as 0.0 (no recidivism) whose true label is 1.0.
false_negatives = df_rejoin.loc[
    df_rejoin["two_year_recid"].eq(1.0) & df_rejoin["predictions"].eq(0.0)
]
false_negatives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,predictions,race_African-American,race_Caucasian,race_Other
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0,0.0,1,0,0
9,21.0,3.0,1.0,5.0,1.0,0,1,0,0,1,0.0,0,1,0
14,47.0,1.0,1.0,1.0,1.0,1,0,0,1,0,0.0,0,1,0
22,27.0,2.0,0.0,3.0,1.0,0,1,1,0,0,0.0,0,1,0
24,24.0,4.0,1.0,5.0,1.0,0,1,0,0,1,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7189,34.0,2.0,3.0,2.0,1.0,0,1,1,0,0,0.0,1,0,0
7190,24.0,3.0,2.0,4.0,1.0,0,1,0,0,1,0.0,1,0,0
7194,30.0,1.0,2.0,1.0,1.0,1,0,1,0,0,0.0,0,1,0
7207,30.0,2.0,0.0,2.0,1.0,0,1,1,0,0,0.0,1,0,0


**The above code selects rows where the true label is "did recidivate" (two_year_recid == 1.0) but the model predicted "did not recidivate" (predictions == 0.0).**

In [64]:
# Per-group false negative count, and that count divided by the group's total
# row count. sep='' joins label and value exactly as string concatenation would.
print(
    'African American false negative count: ',
    false_negatives['race_African-American'].sum(),
    sep='',
)
print(
    'African American false negative rate is : ',
    false_negatives['race_African-American'].sum() / df_rejoin['race_African-American'].sum(),
    sep='',
)
print('\n')
print(
    'Caucasian false negative count: ',
    false_negatives['race_Caucasian'].sum(),
    sep='',
)
print(
    'Caucasian false negative rate is : ',
    false_negatives['race_Caucasian'].sum() / df_rejoin['race_Caucasian'].sum(),
    sep='',
)


African American false negative count: 615
African American false negative rate is : 0.1663961038961039


Caucasian false negative count: 535
Caucasian false negative rate is : 0.2180114099429503


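**The above code calculates and prints the false negative count and false negative rate for both African American and Caucasian groups. Note that the rate is computed here as the group's false negatives divided by the total number of individuals in that group.**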

# **Interpretation-**

**False positives occur when the model predicts recidivism but the individual does not actually recidivate, whereas false negatives occur when the model predicts no recidivism but the individual does recidivate.**

**Caucasians have a greater false negative rate (21.8% vs 16.6%) and African Americans have a higher false positive rate (14.6% vs 8.6%). This indicates that the decision tree model, which was trained on features that include the COMPAS decile scores, is more likely to classify Caucasians as having a lower risk of recidivism and African Americans as having a higher risk.**