In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import normalize
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import SCORERS

import seaborn as sns
sns.set(rc={'figure.figsize':(6,6)})
import warnings
warnings.simplefilter("ignore")

%matplotlib inline

**These lines import various libraries such as NumPy, Pandas, Matplotlib, Seaborn, and scikit-learn modules. It also sets some configurations for the plots and ignores warnings.**

In [22]:
df = pd.read_csv("compas-scores-two-years.csv")

**Reads a CSV file named "compas-scores-two-years.csv" into a Pandas DataFrame called df.**

In [23]:
drop_columns = ['compas_screening_date', 'juv_fel_count',
                'juv_misd_count','c_case_number','vr_charge_degree',
                'vr_offense_date','vr_charge_desc','c_arrest_date',
                'r_case_number', 'vr_case_number','start',
                'juv_other_count','days_b_screening_arrest','c_days_from_compas',
                'first','last','name','dob','c_jail_in','c_jail_out','c_offense_date',
                'c_charge_degree','c_charge_desc','r_charge_degree','r_days_from_arrest',
                'r_offense_date','r_charge_desc','r_jail_in','r_jail_out','violent_recid',
                'score_text','screening_date','v_score_text','v_screening_date','in_custody',
                'out_custody','id','end','type_of_assessment','v_type_of_assessment','is_recid','is_violent_recid','event',
               'decile_score.1','priors_count.1', 'age_cat']
df = df.drop(drop_columns,axis=1)

**Drops unnecessary columns specified in the drop_columns list from the DataFrame.**

In [24]:
df.head()

Unnamed: 0,sex,age,race,decile_score,priors_count,v_decile_score,two_year_recid
0,Male,69,Other,1,0,1,0
1,Male,34,African-American,3,0,1,1
2,Male,24,African-American,4,4,3,1
3,Male,23,African-American,8,1,6,0
4,Male,43,Other,1,2,1,0


**Displays the first few rows of the DataFrame for a quick summary.**

In [25]:
df.race = df.race.replace({
    'Asian':'Other',
    'Hispanic':'Other',
    'Native American':'Other'
})

**Consolidates various racial categories into 'Other' and only focuses on African American vs. Caucasian.**

In [26]:
categorical_cols = df.select_dtypes(exclude=np.number).columns
numerical_cols = df.select_dtypes(np.number).columns

In [27]:
numerical = pd.DataFrame(
    SimpleImputer(strategy="median").fit_transform(df[numerical_cols]),
    columns=numerical_cols
)

**Imputes missing values in numerical columns using the median strategy and creates a new DataFrame numerical with the imputed values.**

In [28]:
categorical = pd.get_dummies(df[categorical_cols])
categorical.head()

Unnamed: 0,sex_Female,sex_Male,race_African-American,race_Caucasian,race_Other
0,0,1,0,0,1
1,0,1,1,0,0
2,0,1,1,0,0
3,0,1,1,0,0
4,0,1,0,0,1


**Encodes categorical variables using one-hot encoding and creates a new DataFrame categorical.**

In [29]:
df_new = pd.merge(numerical, categorical, left_index=True, right_index=True)
df_new.head()

Unnamed: 0,age,decile_score,priors_count,v_decile_score,two_year_recid,sex_Female,sex_Male,race_African-American,race_Caucasian,race_Other
0,69.0,1.0,0.0,1.0,0.0,0,1,0,0,1
1,34.0,3.0,0.0,1.0,1.0,0,1,1,0,0
2,24.0,4.0,4.0,3.0,1.0,0,1,1,0,0
3,23.0,8.0,1.0,6.0,0.0,0,1,1,0,0
4,43.0,1.0,2.0,1.0,0.0,0,1,0,0,1


**Merges the numerical and one-hot encoded categorical DataFrames to create a final preprocessed DataFrame df_new.**

## LOGISTIC REGRESSION

In [38]:
target_variable = 'two_year_recid'
independent_variables = df_new.drop(columns=target_variable).columns
X = df_new[independent_variables]
y = df_new[target_variable]

In [39]:
model = LogisticRegression(class_weight = "balanced")

In [40]:
cross_val_score(
    model, X, y, scoring = "accuracy", cv = 5
).mean()

0.6754905551022056

In [41]:
cross_val_score(
    model, X, y, scoring = "roc_auc", cv = 5
).mean()

0.7322738286706476

In [42]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

In [43]:
X_test_addback = X_test[['race_African-American','race_Caucasian','race_Other']]

drops = ['race_African-American','race_Caucasian','race_Other']
X_train = X_train.drop(drops, axis=1)
X_test = X_test.drop(drops, axis=1)

In [44]:
model.fit(X=X_train, y=y_train)
predictions = model.predict(X_test)

X_test["target"] = y_test.tolist()
X_test["prediction"] = predictions

In [45]:
X_rejoin = pd.concat([X_test, X_test_addback], axis = 1)

**The above code goes through the process of building a logistic regression model, evaluating it through cross-validation, splitting the data into training and testing sets, making predictions on the test set, and finally analyzing and rejoining the results. The model uses class weights to address potential class imbalance.**

In [50]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[570, 253],
       [216, 404]])

**The above code computes a confusion matrix based on the true labels (y_test) and predicted labels (predictions).**

In [51]:
X_rejoin['race_African-American'].sum()

731

**Counts the number of instances for African American racial category in the rejoined DataFrame (X_rejoin). In this case, 731.**

In [52]:
X_rejoin['race_Caucasian'].sum()

505

**Counts the number of instances for Caucasian racial group in the rejoined DataFrame (X_rejoin).**

In [53]:
false_positives = X_rejoin[(X_rejoin.target == 0.0) & (X_rejoin.prediction == 1.0)]
false_positives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,sex_Female,sex_Male,target,prediction,race_African-American,race_Caucasian,race_Other
5952,26.0,5.0,2.0,4.0,0,1,0.0,1.0,0,1,0
509,35.0,2.0,12.0,1.0,1,0,0.0,1.0,1,0,0
676,22.0,9.0,3.0,8.0,0,1,0.0,1.0,1,0,0
6240,21.0,8.0,1.0,9.0,1,0,0.0,1.0,0,1,0
5321,21.0,4.0,1.0,6.0,0,1,0.0,1.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
864,22.0,6.0,1.0,7.0,1,0,0.0,1.0,0,1,0
2818,21.0,8.0,1.0,8.0,0,1,0.0,1.0,1,0,0
4105,33.0,4.0,5.0,2.0,0,1,0.0,1.0,1,0,0
371,57.0,8.0,18.0,7.0,0,1,0.0,1.0,1,0,0


**The above code filters instances where the target is 0 (negative) but predicted as 1 (positive).It then prints the count and rate of false positives for both African American and Caucasian groups.**

In [55]:
print('African American false positive count: ' + str(false_positives['race_African-American'].sum()))
print('African American false positive rate is : ' + str(false_positives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false positive count: ' + str(false_positives['race_Caucasian'].sum()))
print('Caucasian false positive rate is : ' + str(false_positives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

African American false positive count: 151
African American false positive rate is : 0.20656634746922026


Caucasian false positive count: 68
Caucasian false positive rate is : 0.13465346534653466


In [56]:
false_negatives = X_rejoin[(X_rejoin.target == 1.0) & (X_rejoin.prediction == 0.0)]
false_negatives

Unnamed: 0,age,decile_score,priors_count,v_decile_score,sex_Female,sex_Male,target,prediction,race_African-American,race_Caucasian,race_Other
3238,57.0,4.0,8.0,2.0,0,1,1.0,0.0,1,0,0
3858,52.0,1.0,2.0,1.0,1,0,1.0,0.0,1,0,0
132,28.0,2.0,0.0,3.0,0,1,1.0,0.0,0,1,0
1042,32.0,2.0,2.0,2.0,0,1,1.0,0.0,0,1,0
3353,37.0,5.0,1.0,3.0,0,1,1.0,0.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
2609,32.0,3.0,0.0,2.0,1,0,1.0,0.0,1,0,0
730,45.0,4.0,0.0,2.0,0,1,1.0,0.0,1,0,0
290,54.0,2.0,6.0,1.0,0,1,1.0,0.0,0,1,0
3786,36.0,6.0,4.0,3.0,1,0,1.0,0.0,1,0,0


**The above code filters instances where the target is 1 (positive) but predicted as 0 (negative).It then prints the count and rate of false negatives for both African American and Caucasian groups.**

In [57]:
print('African American false negative count: ' + str(false_negatives['race_African-American'].sum()))
print('African American false negative rate is : ' + str(false_negatives['race_African-American'].sum() / X_rejoin['race_African-American'].sum()))
print('\n')
print('Caucasian false negative count: ' + str(false_negatives['race_Caucasian'].sum()))
print('Caucasian false negative rate is : ' + str(false_negatives['race_Caucasian'].sum() / X_rejoin['race_Caucasian'].sum()))

African American false negative count: 92
African American false negative rate is : 0.12585499316005472


Caucasian false negative count: 87
Caucasian false negative rate is : 0.17227722772277226


## **Interpretation**

**Caucasians have a higher false negative rate and African Americans a higher false positive rate.**

**Accordingly, high risk Caucasians are more likely to be categorized as low risk and low risk African Americans are more likely to be classified as high risk.**

## **Implications**

**The disparities in false positive and false negative rates across racial groups may indicate potential bias in the model.**

## **Next Steps:**

**We will consider further analysis, such as fairness metrics, to quantify and mitigate biases.**