In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# from scipy.stats import norm
from scipy import stats
# from sklearn import (
#     linear_model, metrics, pipeline, preprocessing, model_selection
# )
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,mean_absolute_error,r2_score
from time import time

from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedKFold
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [2]:
dataURL = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years-violent.csv'
raw_data = pd.read_csv(dataURL)
print(raw_data.shape)
raw_data.columns

(4743, 54)


Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid', 'two_year_recid.1'],
      dtype='object')

In [3]:
compas_df = raw_data.loc[
    (raw_data['days_b_screening_arrest'] <= 30) &
    (raw_data['days_b_screening_arrest'] >= -30) &
    (raw_data['is_recid'] != -1) &
    (raw_data['c_charge_degree'] != "O") &
    (raw_data['score_text'] != "N/A")
]
len(compas_df)

4020

## Recidivism Classification

In [4]:
df = compas_df.copy()

#### Handle simple bias and unbalance. 
we can clearly see that the sample is unbalanced: Asian and Native Amerian samples are too small -> inapprociate to analyze and make predictions for them -> we'll add them to the Other group. This also makes the sample data more balanced.

In [5]:
df.loc[df['race'].isin(['Native American', 'Asian']), 'race'] = "Other"
df['race'].value_counts()

African-American    1918
Caucasian           1459
Hispanic             355
Other                288
Name: race, dtype: int64

In [6]:
# df['juv_total'] = df[['juv_fel_count', 'juv_misd_count', 'juv_other_count']].sum(axis = 1)

### prepare pipelines to transform data

In [7]:
# pipeline to transform data
categorical_transformer = Pipeline(steps = [('onehot', OneHotEncoder(drop = 'if_binary'))])
numerical_transformer = Pipeline(steps = [('scale', StandardScaler())])

In [22]:
def evaluate_classification_models(model, y_test, y_preds):
        print(model)
        # print("\t Training Time: %0.8f" % train_time)
        # print("\t Prediction Time: %0.8f" % pred_time)
        print("\t Accuracy Score: %0.5f" % accuracy_score(y_test, y_preds))
        # print("\t Classification Report","\n", classification_report(y_test, y_preds))
        print()

In [20]:
# function to run models
def run_model(features, target, stratify, size, classifiers, categorical_features, numerical_features, evaluation):
   
    # split train and test data
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features, 
        target, 
        stratify = stratify,
        test_size = size, 
        random_state = 3)
    
    # preprocessor
    preprocessor = ColumnTransformer(transformers = [
        ('cat_preprocess', categorical_transformer, categorical_features),
        ('num_preprocess', numerical_transformer, numerical_features)
    ])
    
    # run each model on our model pipeline
    for c in classifiers:
        model = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('classifier', c)])
        start = time()
        model.fit(X_train, y_train)
        train_time = time() - start
        y_preds = model.predict(X_test)
        pred_time = time() - (start + train_time)
        evaluation(c, y_test, y_preds)

In [39]:
classifiers = [
    LogisticRegression(random_state=3),
    DecisionTreeClassifier(random_state=3),
    RandomForestClassifier(random_state=3),
]

In [11]:
target = df[['two_year_recid']]
stratify = df['race']

In [40]:
run_model(
    df[['sex','age','race','priors_count','c_charge_degree']], 
    target,
    stratify, 
    0.25,
    classifiers, 
    ['race','sex','c_charge_degree'],
    ['age','priors_count'],
    evaluate_classification_models
)

LogisticRegression(random_state=3)
	 Accuracy Score: 0.83781

DecisionTreeClassifier(random_state=3)
	 Accuracy Score: 0.79502

RandomForestClassifier(random_state=3)
	 Accuracy Score: 0.81891



In [41]:
run_model(
    df[['sex','age','race','priors_count','c_charge_degree','juv_fel_count', 'juv_misd_count', 'juv_other_count']], 
    target,
    stratify, 
    0.2,
    classifiers, 
    ['race','sex','c_charge_degree'],
    ['age','priors_count','juv_fel_count', 'juv_misd_count', 'juv_other_count'],
    evaluate_classification_models
)

LogisticRegression(random_state=3)
	 Accuracy Score: 0.83458

DecisionTreeClassifier(random_state=3)
	 Accuracy Score: 0.80473

RandomForestClassifier(random_state=3)
	 Accuracy Score: 0.80721



In [42]:
classifiers = [
    LogisticRegression(random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(n_estimators = 1000, random_state = 42),
]

In [43]:
run_model(
    df[['sex','age','race','priors_count','c_charge_degree','juv_fel_count', 'juv_misd_count', 'juv_other_count']], 
    target,
    stratify, 
    0.2,
    classifiers, 
    ['race','sex','c_charge_degree'],
    ['age','priors_count','juv_fel_count', 'juv_misd_count', 'juv_other_count'],
    evaluate_classification_models
)

LogisticRegression(random_state=42)
	 Accuracy Score: 0.83458

DecisionTreeClassifier(random_state=42)
	 Accuracy Score: 0.80721

RandomForestClassifier(n_estimators=1000, random_state=42)
	 Accuracy Score: 0.80721



## COMPAS scores for recidivism

In [37]:
def evaluate_regression_models(model, y_test, y_preds):
        print(model)
        # print("\t Training Time: %0.8f" % train_time)
        # print("\t Prediction Time: %0.8f" % pred_time)
        print("\t MAE: %0.5f" % mean_absolute_error(y_test, y_preds)),
        print("\t R2 Score: %05f" % r2_score(y_test, y_preds)),
        print()

In [31]:
regressors = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    DecisionTreeRegressor(),
    RandomForestRegressor()
]

In [32]:
linear_target = df['decile_score']

In [38]:
run_model(
    df[['sex','age','race','priors_count','c_charge_degree','juv_fel_count', 'juv_misd_count', 'juv_other_count']], 
    linear_target,
    stratify, 
    0.2,
    regressors, 
    ['race','sex','c_charge_degree'],
    ['age','priors_count','juv_fel_count', 'juv_misd_count', 'juv_other_count'],
    evaluate_regression_models
)

LinearRegression()
	 MAE: 1.62923
	 R2 Score: 0.444618

Lasso()
	 MAE: 2.22661
	 R2 Score: 0.113033

Ridge()
	 MAE: 1.62923
	 R2 Score: 0.444923

DecisionTreeRegressor()
	 MAE: 1.87046
	 R2 Score: 0.162157

RandomForestRegressor()
	 MAE: 1.70030
	 R2 Score: 0.345064

