In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

import aif360
from aif360.algorithms.preprocessing import DisparateImpactRemover

np.random.seed(1)

In [12]:
# read data
df = pd.read_csv('../data/preprocessed_data.csv')

In [13]:
# one hot encode helpful columns
categoricalFeatures = ['WEAPON I', 'DRUG I', 'AGE TO', 'PAROLEE I', 'CPD ARREST I', 'AGE CURR', 'COMMUNITY AREA']

for feature in categoricalFeatures:
    onehot = pd.get_dummies(df[feature], prefix=feature)
    df = df.drop(feature, axis=1)
    df = df.join(onehot)

In [14]:
encoded_df = df.copy()
x = df.drop(['SSL SCORE'], axis = 1)
y = encoded_df['SSL SCORE']

In [15]:
scaler = StandardScaler()
data_std = scaler.fit_transform(x)
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state = 0)

In [16]:
actual_test = x_test.copy()
actual_test['SSL_Actual'] = y_test
actual_test.shape

(45408, 116)

# Disparate Impact on Testing Data

In [17]:
# Priviliged group: Old (1)
# Unpriviliged group: Young (0)
old_df = actual_test[actual_test['AGE GROUP'] == 0]
num_of_priviliged = old_df.shape[0]
young_df = actual_test[actual_test['AGE GROUP'] == 1]
num_of_unpriviliged = young_df.shape[0]

In [18]:
unpriviliged_outcomes = young_df[young_df['SSL_Actual'] == 1].shape[0]
unpriviliged_ratio = unpriviliged_outcomes/num_of_unpriviliged
unpriviliged_ratio

0.4100127641469295

In [19]:
priviliged_outcomes = old_df[old_df['SSL_Actual'] == 1].shape[0]
priviliged_ratio = priviliged_outcomes/num_of_priviliged
priviliged_ratio

0.999340342197485

In [20]:
# Calculating disparate impact
disparate_impact = unpriviliged_ratio / priviliged_ratio
print("Disparate Impact, Age vs. SSL Score: " + str(disparate_impact))

Disparate Impact, Age vs. SSL Score: 0.4102834108001063


# Logistic Regression Model

In [21]:
from sklearn.linear_model import LogisticRegression
# Liblinear is a solver that is very fast for small datasets, like ours
model = LogisticRegression(solver='liblinear', class_weight='balanced')

In [22]:
model.fit(x_train, y_train)

In [23]:
# Predicts 5/5 correctly
y_pred = pd.Series(model.predict(x_test))
y_test = y_test.reset_index(drop=True)
z = pd.concat([y_test, y_pred], axis=1)
z.columns = ['True', 'Prediction']
z.head()

Unnamed: 0,True,Prediction
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


In [24]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))

Accuracy: 0.7235068710359408
Precision: 0.7246661516388919
Recall: 0.9975388915896938


# Disparate Impact on Predicted Values by LR Model

In [25]:
y_pred = model.predict(x_test)
x_test['SSL_Predicted'] = y_pred
original_output = x_test
original_output

Unnamed: 0,SEX CODE CD,RACE CODE CD,AGE GROUP,STOP ORDER NO,LATEST DATE,LATEST DIST,LATEST DIST RES,WEAPONS ARR CNT,LATEST WEAPON ARR DATE,NARCOTICS ARR CNT,...,COMMUNITY AREA_WASHINGTON PARK,COMMUNITY AREA_WEST ELSDON,COMMUNITY AREA_WEST ENGLEWOOD,COMMUNITY AREA_WEST GARFIELD PARK,COMMUNITY AREA_WEST LAWN,COMMUNITY AREA_WEST PULLMAN,COMMUNITY AREA_WEST RIDGE,COMMUNITY AREA_WEST TOWN,COMMUNITY AREA_WOODLAWN,SSL_Predicted
182622,1,0,0,0.0,2016,14,14,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
191050,0,0,0,0.0,2013,4,4,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
64436,0,0,1,0.0,2016,11,15,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
138025,1,0,1,0.0,2012,9,9,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1
4238,1,1,1,0.0,2016,24,31,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3032,1,0,1,0.0,2013,31,16,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
159287,0,0,1,0.0,2014,7,3,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,1
75983,1,1,0,0.0,2014,31,31,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,1
80825,1,0,0,0.0,2016,11,15,1.0,2009.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
# Priviliged group: White (1)
# Unpriviliged group: Non-White (0)
old_df = original_output[original_output['AGE GROUP'] == 1]
num_of_priviliged = old_df.shape[0]
young_df = original_output[original_output['AGE GROUP'] == 0]
num_of_unpriviliged = young_df.shape[0]

In [27]:
unpriviliged_outcomes = young_df[young_df['SSL_Predicted'] == 1].shape[0]
unpriviliged_ratio = unpriviliged_outcomes/num_of_unpriviliged
unpriviliged_ratio

0.9977324263038548

In [28]:
priviliged_outcomes = old_df[old_df['SSL_Predicted'] == 1].shape[0]
priviliged_ratio = priviliged_outcomes/num_of_priviliged
priviliged_ratio

0.9977308183236421

In [29]:
disparate_impact = unpriviliged_ratio / priviliged_ratio
print("Disparate Impact, Age vs. Predicted SSL: " + str(disparate_impact))

Disparate Impact, Age vs. Predicted SSL: 1.0000016116373107


# Disparate Impact Remover Using AIF360

In [30]:
binaryLabelDataset = aif360.datasets.BinaryLabelDataset(
    favorable_label=1,
    unfavorable_label=0,
    df=encoded_df,
    label_names=['SSL SCORE'],
    protected_attribute_names=['AGE GROUP'])
di = DisparateImpactRemover(repair_level = 1.0)
dataset_transf_train = di.fit_transform(binaryLabelDataset)
transformed = dataset_transf_train.convert_to_dataframe()[0]
transformed

Unnamed: 0,SEX CODE CD,RACE CODE CD,AGE GROUP,STOP ORDER NO,LATEST DATE,LATEST DIST,LATEST DIST RES,WEAPONS ARR CNT,LATEST WEAPON ARR DATE,NARCOTICS ARR CNT,...,COMMUNITY AREA_WASHINGTON PARK,COMMUNITY AREA_WEST ELSDON,COMMUNITY AREA_WEST ENGLEWOOD,COMMUNITY AREA_WEST GARFIELD PARK,COMMUNITY AREA_WEST LAWN,COMMUNITY AREA_WEST PULLMAN,COMMUNITY AREA_WEST RIDGE,COMMUNITY AREA_WEST TOWN,COMMUNITY AREA_WOODLAWN,SSL SCORE
0,1.0,0.0,0.0,0.0,2012.0,31.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,0.0,0.0,2014.0,17.0,31.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,2016.0,1.0,31.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,2015.0,31.0,31.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,2016.0,31.0,31.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227031,0.0,0.0,1.0,0.0,2015.0,9.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
227032,1.0,0.0,1.0,0.0,2016.0,16.0,12.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
227033,1.0,0.0,0.0,0.0,2007.0,9.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
227034,0.0,1.0,1.0,0.0,2013.0,8.0,31.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
x_trans = transformed.drop(['SSL SCORE'], axis = 1)
y = transformed['SSL SCORE']
model = LogisticRegression(solver='liblinear', class_weight='balanced')
scaler = StandardScaler()
data_std = scaler.fit_transform(x_trans)
x_trans_train,x_trans_test,y_trans_train,y_trans_test = train_test_split(x_trans, y, test_size=0.2, random_state = 0)

In [32]:
model.fit(x_trans_train, y_trans_train)

In [33]:
# 5/5 correct predictions
y_trans_pred = pd.Series(model.predict(x_trans_test))
y_trans_test = y_trans_test.reset_index(drop=True)
z = pd.concat([y_trans_test, y_trans_pred], axis=1)
z.columns = ['True', 'Prediction']
z.head()

Unnamed: 0,True,Prediction
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [34]:
print("Accuracy:", metrics.accuracy_score(y_test, y_trans_pred))
print("Precision:", metrics.precision_score(y_test, y_trans_pred))
print("Recall:", metrics.recall_score(y_test, y_trans_pred))

Accuracy: 0.2806553911205074
Precision: 0.7012987012987013
Recall: 0.013125911521633447


In [35]:
y_trans_pred = model.predict(x_trans_test)
x_trans_test['SSL_Predicted'] = y_trans_pred
transformed_output = x_trans_test
transformed_output

Unnamed: 0,SEX CODE CD,RACE CODE CD,AGE GROUP,STOP ORDER NO,LATEST DATE,LATEST DIST,LATEST DIST RES,WEAPONS ARR CNT,LATEST WEAPON ARR DATE,NARCOTICS ARR CNT,...,COMMUNITY AREA_WASHINGTON PARK,COMMUNITY AREA_WEST ELSDON,COMMUNITY AREA_WEST ENGLEWOOD,COMMUNITY AREA_WEST GARFIELD PARK,COMMUNITY AREA_WEST LAWN,COMMUNITY AREA_WEST PULLMAN,COMMUNITY AREA_WEST RIDGE,COMMUNITY AREA_WEST TOWN,COMMUNITY AREA_WOODLAWN,SSL_Predicted
182622,1.0,0.0,0.0,0.0,2016.0,14.0,14.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191050,0.0,0.0,0.0,0.0,2013.0,4.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64436,0.0,0.0,1.0,0.0,2016.0,11.0,15.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138025,1.0,0.0,1.0,0.0,2012.0,9.0,9.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4238,1.0,1.0,1.0,0.0,2016.0,24.0,31.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3032,1.0,0.0,1.0,0.0,2013.0,31.0,16.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
159287,0.0,0.0,1.0,0.0,2014.0,7.0,3.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75983,1.0,1.0,0.0,0.0,2014.0,31.0,31.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
80825,1.0,0.0,0.0,0.0,2016.0,11.0,15.0,1.0,2009.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Priviliged group: White (1)
# Unpriviliged group: Non-White (0)
old_df = transformed_output[transformed_output['AGE GROUP'] == 1]
num_of_priviliged = old_df.shape[0]
young_df = transformed_output[transformed_output['AGE GROUP'] == 0]
num_of_unpriviliged = young_df.shape[0]

In [37]:
unpriviliged_outcomes = young_df[young_df['SSL_Predicted'] == 1].shape[0]
unpriviliged_ratio = unpriviliged_outcomes/num_of_unpriviliged
unpriviliged_ratio

0.01261595547309833

In [38]:
priviliged_outcomes = old_df[old_df['SSL_Predicted'] == 1].shape[0]
priviliged_ratio = priviliged_outcomes/num_of_priviliged
priviliged_ratio

0.014655131659811848

In [39]:
disparate_impact = unpriviliged_ratio / priviliged_ratio
print("Disparate Impact, Age vs. Predicted SSL Score: " + str(disparate_impact))

Disparate Impact, Age vs. Predicted SSL Score: 0.8608558262014482
