In [187]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# from scipy.stats import norm
from scipy import stats
# from sklearn import (
#     linear_model, metrics, pipeline, preprocessing, model_selection
# )
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score,confusion_matrix, classification_report,mean_absolute_error,r2_score
from time import time
from sklearn.model_selection import cross_val_score


from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [73]:
dataURL = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years-violent.csv'
raw_data = pd.read_csv(dataURL)
print(raw_data.shape)
raw_data.columns

(4743, 54)


Index(['id', 'name', 'first', 'last', 'compas_screening_date', 'sex', 'dob',
       'age', 'age_cat', 'race', 'juv_fel_count', 'decile_score',
       'juv_misd_count', 'juv_other_count', 'priors_count',
       'days_b_screening_arrest', 'c_jail_in', 'c_jail_out', 'c_case_number',
       'c_offense_date', 'c_arrest_date', 'c_days_from_compas',
       'c_charge_degree', 'c_charge_desc', 'is_recid', 'r_case_number',
       'r_charge_degree', 'r_days_from_arrest', 'r_offense_date',
       'r_charge_desc', 'r_jail_in', 'r_jail_out', 'violent_recid',
       'is_violent_recid', 'vr_case_number', 'vr_charge_degree',
       'vr_offense_date', 'vr_charge_desc', 'type_of_assessment',
       'decile_score.1', 'score_text', 'screening_date',
       'v_type_of_assessment', 'v_decile_score', 'v_score_text',
       'v_screening_date', 'in_custody', 'out_custody', 'priors_count.1',
       'start', 'end', 'event', 'two_year_recid', 'two_year_recid.1'],
      dtype='object')

In [74]:
compas_df = raw_data.loc[
    (raw_data['days_b_screening_arrest'] <= 30) &
    (raw_data['days_b_screening_arrest'] >= -30) &
    (raw_data['is_recid'] != -1) &
    (raw_data['c_charge_degree'] != "O") &
    (raw_data['score_text'] != "N/A")
]
len(compas_df)

4020

## Recidivism Classification

### Prepare Data

In [75]:
df = compas_df.copy()

#### Handle simple bias and unbalance. 
we can clearly see that the sample is unbalanced: Asian and Native Amerian samples are too small -> inapprociate to analyze and make predictions for them -> we'll add them to the Other group. This also makes the sample data more balanced.

In [76]:
df.loc[df['race'].isin(['Native American', 'Asian']), 'race'] = "Other"
df['race'].value_counts()

African-American    1918
Caucasian           1459
Hispanic             355
Other                288
Name: race, dtype: int64

In [77]:
features = df[['sex','age','race','priors_count','c_charge_degree','juv_fel_count', 'juv_misd_count', 'juv_other_count']]
target = df['two_year_recid']

In [78]:
# split train and test data
X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features, 
        target, 
        # stratify = stratify,
        test_size = 0.3, 
        random_state = 42)

### Data Preprocessing

In [79]:
# classify data type
categorical_features = ['race','sex','c_charge_degree']
numerical_features = ['age','priors_count','juv_fel_count', 'juv_misd_count', 'juv_other_count']

In [80]:
# pipeline to transform data
categorical_transformer = Pipeline(steps = [('onehot', OneHotEncoder(drop = 'if_binary'))])
numerical_transformer = Pipeline(steps = [('scale', StandardScaler())])

In [81]:
preprocessor = ColumnTransformer(transformers = [
    ('cat_preprocess', categorical_transformer, categorical_features),
    ('num_preprocess', numerical_transformer, numerical_features)
])

### Define metrics and functions

In [91]:
def evaluate_classification_models(model, y_test, y_preds,train_time,pred_time):
        print(model)
        print("\t Training Time: %0.8f" % train_time)
        print("\t Prediction Time: %0.8f" % pred_time)
        print("\t Accuracy Score: %0.5f" % accuracy_score(y_test, y_preds))
        print("\t Classification Report","\n", classification_report(y_test, y_preds))
        print()

In [134]:
def run_model(classifiers):
    for c in classifiers:
        model = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('classifier', c)])
        start = time()
        accuracy_score = cross_val_score(model,X_train,y_train, cv=3, scoring ='accuracy')
        f1_score = cross_val_score(model,X_train,y_train, cv=3, scoring ='f1')
        train_time = time() - start
        print(c)
        print("\t Training Time: %0.8f" % train_time)
        print("\t AVG Accuracy Score: %0.5f" % accuracy_score.mean())
        print("\t AVG F1 Score: %0.5f" % f1_score.mean())


### Baseline Model

In [135]:
baseline_model = [LogisticRegression(random_state=42)]
run_model(baseline_model)

LogisticRegression(random_state=42)
	 Training Time: 0.12617302
	 AVG Accuracy Score: 0.84115
	 AVG F1 Score: 0.17133


### Model Comparison & Selection

In [141]:
classifers = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
]

In [137]:
run_model(classifers)

DecisionTreeClassifier(random_state=42)
	 Training Time: 0.07846189
	 AVG Accuracy Score: 0.79211
	 AVG F1 Score: 0.27640
RandomForestClassifier(random_state=42)
	 Training Time: 0.60507512
	 AVG Accuracy Score: 0.80952
	 AVG F1 Score: 0.26550


**Our tree models aren't better than baseline logistic regression model -> need to tuning**

|Model          | Accuracy Score      |
| :------------ | :-----------: |
| **Baseline Logistic Regression**  | **84.115%**     |
| Decision Tree  | **79.211%**     |
| Random Forest | **80.952%**     

### Tunning Decision Tree

#### Randomized Parameter Tuning

In [160]:
from sklearn.model_selection import RandomizedSearchCV

dt_pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('model', DecisionTreeClassifier(random_state = 42))
])
dt_param_dis = {
    'model__criterion': ['gini','entropy'],
    'model__splitter': ['best','random'],
    'model__max_depth': [2,5,10,20,40,None],
    'model__min_samples_split': [2,5,10,15],
    'model__max_features': ['auto','sqrt','log2',None]}

rs_dt = RandomizedSearchCV(
        estimator = dt_pipeline, 
        param_distributions = dt_param_dis, 
        n_iter =100, 
        cv = 3, 
        random_state = 42, 
        scoring ='accuracy')

In [158]:
rs_dt.fit(X_train,y_train)
print('best stcore = ' + str(rs_dt.best_score_))
print('best params = ' + str(rs_dt.best_params_))

best stcore = 0.8386638237384506
best params = {'model__splitter': 'random', 'model__min_samples_split': 5, 'model__max_features': 'auto', 'model__max_depth': 2, 'model__criterion': 'entropy'}


#### GridSearch

In [161]:
from sklearn.model_selection import GridSearchCV


gs_dt = GridSearchCV(
        estimator = dt_pipeline, 
        param_grid = dt_param_dis, 
        cv = 3,
        scoring ='accuracy')

In [162]:
gs_dt.fit(X_train,y_train)
print('best stcore = ' + str(gs_dt.best_score_))
print('best params = ' + str(gs_dt.best_params_))

best stcore = 0.8397299218194741
best params = {'model__criterion': 'entropy', 'model__max_depth': 5, 'model__max_features': None, 'model__min_samples_split': 15, 'model__splitter': 'random'}


### Tunning Random Forest

### Randomized Parameter Tuning

In [175]:
rf_pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(random_state = 42))
])

rf_param_dis = {
    'model__n_estimators': [int(x) for x in np.linspace(start = 50, stop = 300, num = 10)],
    'model__max_features': ['auto', 'sqrt'],
    'model__max_depth': [int(x) for x in np.linspace(60, 200, num = 11)]+[None],
    'model__min_samples_split': [10, 15, 20],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]}

rs_rf = RandomizedSearchCV(
    estimator = rf_pipeline, 
    param_distributions = rf_param_dis, 
    n_iter = 100, 
    cv = 3, 
    random_state = 42, 
    scoring ='accuracy')

In [176]:
start = time()
rs_rf.fit(X_train,y_train)
print('best stcore = ' + str(rs_rf.best_score_))
print('best params = ' + str(rs_rf.best_params_))
train_time = time() - start
print(train_time)

best stcore = 0.8400852878464818
best params = {'model__n_estimators': 77, 'model__min_samples_split': 20, 'model__min_samples_leaf': 4, 'model__max_features': 'auto', 'model__max_depth': 144, 'model__bootstrap': True}
52.80803990364075


In [177]:
rf_param_gs = {
    'model__n_estimators': [int(x) for x in np.linspace(start = 60, stop = 80, num = 5)],
    'model__max_features': ['auto', 'sqrt'],
    'model__max_depth': [int(x) for x in np.linspace(100, 200, num = 10)]+[None],
    'model__min_samples_split': [20, 25, 30],
    'model__min_samples_leaf': [4, 6, 8],
    'model__bootstrap': [True, False]}

In [178]:
gs_dt = GridSearchCV(
        estimator = rf_pipeline, 
        param_grid = rf_param_gs, 
        cv = 3,
        scoring ='accuracy')

In [179]:
start = time()
gs_dt.fit(X_train,y_train)
print('best stcore = ' + str(gs_dt.best_score_))
print('best params = ' + str(gs_dt.best_params_))
train_time = time() - start
print(train_time)

best stcore = 0.8422174840085287
best params = {'model__bootstrap': True, 'model__max_depth': 100, 'model__max_features': 'auto', 'model__min_samples_leaf': 6, 'model__min_samples_split': 25, 'model__n_estimators': 60}
391.5878019332886


In [181]:
rf_param_gs = {
    'model__n_estimators': [int(x) for x in np.linspace(start = 50, stop = 200, num = 5)],
    'model__max_features': ['auto', 'sqrt'],
    'model__max_depth': [int(x) for x in np.linspace(100, 200, num = 10)]+[None],
    'model__min_samples_split': [int(x) for x in np.linspace(6, 20, num = 1)]+[None],
    'model__min_samples_leaf': [int(x) for x in np.linspace(20, 200, num = 10)]+[None],
    'model__bootstrap': [True, False]
}

In [182]:
start = time()
gs_dt.fit(X_train,y_train)
print('best stcore = ' + str(gs_dt.best_score_))
print('best params = ' + str(gs_dt.best_params_))
train_time = time() - start
print(train_time)

best stcore = 0.8422174840085287
best params = {'model__bootstrap': True, 'model__max_depth': 100, 'model__max_features': 'auto', 'model__min_samples_leaf': 6, 'model__min_samples_split': 25, 'model__n_estimators': 60}
392.04707312583923


#### Compare models against the Test Set

In [None]:
nb_pred = nb.predict(X_test)
ens_pred = ens.predict(X_test)
dt_pred = dt_voting.predict(X_test)
ens_stack_pred = ens_stack.predict(X_test)
rf_pred = rf_est.predict(X_test)

In [198]:
def test_model(classifiers):
    for c in classifiers:
        model = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('classifier', c)])
        start = time()
        model.fit(X_train, y_train)
        y_preds = model.predict(X_test)
        test_time = time() - start
        print(c)
        print("\t Testing Time: %0.8f" % test_time)
        print("\t AVG Accuracy Score: %0.5f" % accuracy_score(y_test, y_preds))
        print("\t AVG F1 Score: %0.5f" % f1_score(y_test, y_preds))
        print()


In [191]:
test_model(baseline_model)

LogisticRegression(random_state=42)
	 Testing Time: 0.05517220
	 AVG Accuracy Score: 0.84494
	 AVG F1 Score: 0.18341


In [196]:
tunned_classifers =[
    DecisionTreeClassifier(
        criterion = 'entropy', 
        max_depth = 5, 
        max_features = None,
        min_samples_split = 15, 
        splitter = 'random'
    ),
    RandomForestClassifier(
        n_estimators = 60,
        max_depth = 100,
        max_features = 'auto',
        bootstrap = True,
        min_samples_split = 25,
        min_samples_leaf = 6
    )
]
        
    

In [199]:
test_model(tunned_classifers)

DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=15,
                       splitter='random')
	 Testing Time: 0.01809406
	 AVG Accuracy Score: 0.83665
	 AVG F1 Score: 0.23346

RandomForestClassifier(max_depth=100, max_features='auto', min_samples_leaf=6,
                       min_samples_split=25, n_estimators=60)
	 Testing Time: 0.08935714
	 AVG Accuracy Score: 0.84577
	 AVG F1 Score: 0.22500

