In [64]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score,confusion_matrix, classification_report,mean_absolute_error,r2_score, roc_auc_score
from time import time
from sklearn.model_selection import cross_val_score, train_test_split,cross_validate, StratifiedKFold
from xgboost import XGBClassifier
# from sklearn.feature_selection import SelectKBest, chi2
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import warnings
import os
warnings.filterwarnings('ignore')

In [2]:
rand = 3
os.environ['PYTHONHASHSEED']=str(rand)
np.random.seed(rand)

## Load Data

In [3]:
dataURL = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv'
raw_data = pd.read_csv(dataURL)
print(raw_data.shape)
raw_data.columns

(7214, 53)


Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid'],
      dtype='object')

In [4]:
compas_df = raw_data.loc[
    (raw_data['days_b_screening_arrest'] <= 30) &
    (raw_data['days_b_screening_arrest'] >= -30) &
    (raw_data['is_recid'] != -1) &
    (raw_data['c_charge_degree'] != "O") &
    (raw_data['score_text'] != "N/A")
]
len(compas_df)

6172

## Recidivism Classification

### Prepare Data

In [25]:
df = compas_df.copy()

#### Check data bias. 
we can clearly see that the sample is unbalanced: Asian and Native Amerian samples are too small -> inapprociate to analyze and make predictions for them -> we'll add them to the Other group. This also makes the sample data more balanced.

In [26]:
df.loc[df['race'].isin(['Native American', 'Asian']), 'race'] = "Other"
df['race'].value_counts()

African-American    3175
Caucasian           2103
Hispanic             509
Other                385
Name: race, dtype: int64

#### Check Data Balance

In [36]:
not_recid = (df['two_year_recid'].sum()/df['two_year_recid'].shape[0])*100

print("Not Recidivism Rate: %.2f%%" % not_recid)
print("Recidivism Rate: %.2f%%" % (100-not_recid))

Not Recidivism Rate: 45.51%
Recidivism Rate: 54.49%


#### Preprocess Steps

In [27]:
# copy the original columns for reference later
df[['race_ref','age_ref','c_charge_degree']] = df[['race','sex','c_charge_degree']]

In [28]:
# Label Encode Categorical Data
df[['race','sex','c_charge_degree']] = df[['race','sex','c_charge_degree']].apply(LabelEncoder().fit_transform)

In [38]:
# df.head()

In [39]:
features = df[['sex','age','race','priors_count','c_charge_degree']]
target = df['two_year_recid']
stratify = df['two_year_recid']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(
        features, 
        target, 
        stratify = stratify,
        test_size = 0.25,
        random_state = rand
)

## Modeling & Evaluation

### Define Models to train

In [68]:
classifiers = {
    'Logistic Regression': {'model': Pipeline(steps = [
            ('scale', StandardScaler()),
            ('classifier', LogisticRegression(random_state = rand))
             ])},
    'Decision Tree' : {'model': DecisionTreeClassifier(random_state = rand)},
    'Random Forest' : {'model': RandomForestClassifier(random_state = rand)},
    'XGBoost': {'model': XGBClassifier(random_state = rand)}
}

In [69]:
cv_result = pd.DataFrame(columns= ['Model', 
                                   'AVG F1', 
                                   'AVG AUC', 
                                   'AVG Precision', 
                                   'AVG Recall', 
                                   'AVG Accuracy'])

In [70]:
for model_name in classifiers.keys():
    model = classifiers[model_name]['model']
    
    # define scoring metrics
    scoring = ['f1','roc_auc','precision','recall','accuracy']
        
    # generate cross validation for with defined random state
    skf = StratifiedKFold(n_splits = 5, random_state = rand, shuffle = True)
        
    # run cross validation
    scores = cross_validate(
        model, 
        X_train, 
        y_train, 
        scoring = scoring, 
        cv = skf)
    cv_result = cv_result.append({'Model': model_name, 
                                  'AVG F1': scores['test_f1'].mean(), 
                                  'AVG AUC': scores['test_roc_auc'].mean(),
                                  'AVG Precision': scores['test_precision'].mean(), 
                                   'AVG Recall': scores['test_recall'].mean(), 
                                   'AVG Accuracy': scores['test_accuracy'].mean()
                                 }, ignore_index = True
                                )

In [73]:
# print the result, sort by the AVG F1
cv_result.sort_values(by='AVG F1', ascending=False)

Unnamed: 0,Model,AVG F1,AVG AUC,AVG Precision,AVG Recall,AVG Accuracy
0,Logistic Regression,0.603266,0.725765,0.677022,0.544362,0.674225
3,XGBoost,0.597061,0.699279,0.642393,0.559087,0.656729
2,Random Forest,0.584983,0.67675,0.594487,0.57665,0.627782
1,Decision Tree,0.551325,0.632899,0.595979,0.513532,0.61979


### Tuning

#### Random Search

In [113]:
# define params grid for decision tree
dt_param_dis = {
    'model__criterion': ['gini','entropy'],
    'model__splitter': ['best','random'],
    'model__max_depth': [2,5,10,20,40,None],
    'model__min_samples_split': [2,5,10,15],
    'model__max_features': ['auto','sqrt','log2',None]}

In [88]:
# define params grid for random forest
rf_param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 200, num = 20)],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(80, 100, num = 10)],
    'min_samples_split': [int(x) for x in np.linspace(start = 2, stop = 20, num = 5)],
    'min_samples_leaf': [int(x) for x in np.linspace(start = 1, stop = 20, num = 2)],
    'bootstrap': [True, False]
}

In [97]:
# define params grid for XGB
xgb_param_grid = {
    'max_depth': [int(x) for x in np.linspace(start = 3, stop = 5, num = 1)],
    'learning_rate': [float(x) for x in np.arange(start = 0.1, stop = 0.5, step = 0.1)],
    'n_estimators': [int(x) for x in np.linspace(start = 300, stop = 500, num = 50)],
    'reg_alpha': [float(x) for x in np.arange(start = 2.5, stop = 5.0, step = 0.1)],
    'reg_lambda': [float(x) for x in np.arange(start = 2.5, stop = 5.0, step = 0.1)]
}

In [116]:
tuning_classifiers = {
    'Decision Tree' : 
    {'model': DecisionTreeClassifier(random_state = rand), 'param_grid': dt_param_dis},
    'Random Forest' : 
    {'model': RandomForestClassifier(random_state = rand), 'param_grid': rf_param_grid},
    'XGBoost': 
    {'model': XGBClassifier(random_state = rand), 'param_grid': xgb_param_grid},
    
}

In [117]:
for model_name in tuning_classifiers.keys():
    rs = RandomizedSearchCV(
        estimator = tuning_classifiers[i]['model'], 
        param_distributions = tuning_classifiers[i]['param_grid'], 
        n_iter = 100, 
        cv = 5, 
        scoring ='f1_micro')
    rs.fit(X_train, y_train)
    print(model_name)
    print('best score = ' + str(rs.best_score_))
    print('best params = ' + str(rs.best_params_))
    print()

Decision Tree
best score = 0.6880539373066371
best params = {'reg_lambda': 4.900000000000002, 'reg_alpha': 2.7, 'n_estimators': 414, 'max_depth': 3, 'learning_rate': 0.1}

Random Forest
best score = 0.6889181016870002
best params = {'reg_lambda': 3.700000000000001, 'reg_alpha': 2.9000000000000004, 'n_estimators': 451, 'max_depth': 3, 'learning_rate': 0.2}

XGBoost
best score = 0.6882694530383515
best params = {'reg_lambda': 2.9000000000000004, 'reg_alpha': 3.2000000000000006, 'n_estimators': 361, 'max_depth': 3, 'learning_rate': 0.2}



## Model Interpretations

#### Logistic Regression

In [118]:
coefs_log = classifiers['Logistic Regression']['model'].steps[1][1].coef_[0]
intercept_log = classifiers['Logistic Regression']['model'].steps[1][1].intercept_
print('coefficients:\t%s' % coefs_log)
print('intercept:\t%s' % intercept_log)

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [119]:
lg = Pipeline(steps = [
            ('scale', StandardScaler()),
            ('classifier', LogisticRegression(random_state = rand))
             ])

In [120]:
lg.fit(X_train, y_train)