In [151]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [152]:
from os.path import basename, exists

def download(url):
    filename = basename(url)
    if not exists(filename):
        from urllib.request import urlretrieve
        local, _ = urlretrieve(url, filename)
        print('Downloaded ' + local)

download('https://github.com/propublica/compas-analysis/raw/master/' +
         'compas-scores-two-years.csv')
df = pd.read_csv('compas-scores-two-years.csv')
df.shape

(7214, 53)

In [153]:
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_decile_score,v_score_text,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,Male,1947-04-18,69,Greater than 45,Other,...,1,Low,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0
1,3,kevon dixon,kevon,dixon,2013-01-27,Male,1982-01-22,34,25 - 45,African-American,...,1,Low,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1
2,4,ed philo,ed,philo,2013-04-14,Male,1991-05-14,24,Less than 25,African-American,...,3,Low,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1
3,5,marcu brown,marcu,brown,2013-01-13,Male,1993-01-21,23,Less than 25,African-American,...,6,Medium,2013-01-13,,,1,0,1174,0,0
4,6,bouthy pierrelouis,bouthy,pierrelouis,2013-03-26,Male,1973-01-22,43,25 - 45,Other,...,1,Low,2013-03-26,,,2,0,1102,0,0


- If the charge date of a defendants Compas scored crime was not within 30 days from when the person was arrested, we assume that because of data quality reasons, that we do not have the right offense.
- We coded the recidivist flag -- is_recid -- to be -1 if we could not find a compas case at all.
- In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O' -- will not result in Jail time are removed
- We filtered the underlying data from Broward county to include only those rows representing people who had either recidivated in two years, or had at least two years outside of a correctional facility.

―[NorthPointe](https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb)

In [154]:
df = df[(df["is_recid"] != -1) & (df["days_b_screening_arrest"] <= 30) & (df["days_b_screening_arrest"] >= -30) & (df["c_charge_degree"] != "O") & (df["score_text"] != "N/A")]
df.shape

(6172, 53)

**Part I: Analysis First**

In [155]:
# COMPAS recidivism confusion matrix
df['guessed_recid'] = df['score_text'] != 'Low'
df['actual_recid'] = df.two_year_recid == 1
cm = pd.crosstab(df.actual_recid, df.guessed_recid)
cm # for "confusion matrix"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


guessed_recid,False,True
actual_recid,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2345,1018
True,1076,1733


In [156]:
# The usual definitions. First index is predicted, second is actual
TN = cm[False][False]
TP = cm[True][True]
FN = cm[False][True]
FP = cm[True][False]

About 63% of those scored as medium or high risk end up getting arrested again within two years. This is the **Positive Predictive Value (PPV)** or **True Positive(TP)**. 

In [157]:
# PPV
TP / (TP + FP)

0.6299527444565612

Of those who did not go on to be re-arrested, about 30% were classified as medium or high risk. This is the **False Positive Rate (FPR)**.
This is also known as the "**type I Error**" in statistics.

In [158]:
# FPR
FP / (FP + TN)

0.30270591733571217

We can also calculate the **False Negative Rate (FNR)** : when the value is actually positive but we classified them as negative. In this case, it counts those who were classified as low risk, as a fraction of those who were re-arrested.

Called "**type II error**" in statistic.

In [159]:
# FNR
FN / P

0.38305446778212887

To study the difference betwen races, let's define a few helper functions.



In [160]:
# cm is a confusion matrix. The rows are guessed, the columns are actual 
def print_ppv_fpv(cm):
    # the indices here are [col][row] or [actual][guessed]
    TN = cm[False][False]   
    TP = cm[True][True]
    FN = cm[True][False]
    FP = cm[False][True]
    print('Accuracy: ', (TN+TP)/(TN+TP+FN+FP))
    print('PPV: ', TP / (TP + FP))
    print('FPR: ', FP / (FP + TN))
    print('FNR: ', FN / (FN + TP))
    print()

def print_metrics(guessed, actual):
    cm = pd.crosstab(guessed, actual, rownames=['guessed'], colnames=['actual'])
    print(cm)
    print()
    print_ppv_fpv(cm)  

In [161]:
print('White')
subset = df[df.race == 'Caucasian']
print_metrics(subset.guessed_recid, subset.actual_recid)

print('Black')
subset = df[df.race == 'African-American']
print_metrics(subset.guessed_recid, subset.actual_recid)

White
actual   False  True
guessed             
False      999   408
True       282   414

Accuracy:  0.6718972895863052
PPV:  0.5948275862068966
FPR:  0.22014051522248243
FNR:  0.49635036496350365

Black
actual   False  True
guessed             
False      873   473
True       641  1188

Accuracy:  0.6491338582677165
PPV:  0.6495352651722253
FPR:  0.4233817701453104
FNR:  0.2847682119205298



**Part II: Building My Own Predictor**

Start with some feature engineering 

In [162]:
sex_mapDict={'Male':1,
            'Female':0}
df['sex']=df['sex'].map(sex_mapDict)
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid,guessed_recid,actual_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,1,1947-04-18,69,Greater than 45,Other,...,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0,False,False
1,3,kevon dixon,kevon,dixon,2013-01-27,1,1982-01-22,34,25 - 45,African-American,...,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1,False,True
2,4,ed philo,ed,philo,2013-04-14,1,1991-05-14,24,Less than 25,African-American,...,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1,False,True
5,7,marsha miles,marsha,miles,2013-11-30,1,1971-08-22,44,25 - 45,Other,...,2013-11-30,2013-11-30,2013-12-01,0,1,853,0,0,False,False
6,8,edward riddle,edward,riddle,2014-02-19,1,1974-07-23,41,25 - 45,Caucasian,...,2014-02-19,2014-03-31,2014-04-18,14,5,40,1,1,True,True


In [163]:
age_cat_Df = pd.DataFrame()
age_cat_Df = pd.get_dummies( df['age_cat'] , prefix='age_cat' )
age_cat_Df.head()

Unnamed: 0,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25
0,0,1,0
1,1,0,0
2,0,0,1
5,1,0,0
6,1,0,0


Felony crimes can be punishable by life or in certain states capital crimes are punishable by death. Minor offences are considered misdemeanors.

In [164]:
sex_mapDict={'F':1,
            'M':0}
df['c_charge_degree']=df['c_charge_degree'].map(sex_mapDict)
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,age_cat,race,...,v_screening_date,in_custody,out_custody,priors_count.1,start,end,event,two_year_recid,guessed_recid,actual_recid
0,1,miguel hernandez,miguel,hernandez,2013-08-14,1,1947-04-18,69,Greater than 45,Other,...,2013-08-14,2014-07-07,2014-07-14,0,0,327,0,0,False,False
1,3,kevon dixon,kevon,dixon,2013-01-27,1,1982-01-22,34,25 - 45,African-American,...,2013-01-27,2013-01-26,2013-02-05,0,9,159,1,1,False,True
2,4,ed philo,ed,philo,2013-04-14,1,1991-05-14,24,Less than 25,African-American,...,2013-04-14,2013-06-16,2013-06-16,4,0,63,0,1,False,True
5,7,marsha miles,marsha,miles,2013-11-30,1,1971-08-22,44,25 - 45,Other,...,2013-11-30,2013-11-30,2013-12-01,0,1,853,0,0,False,False
6,8,edward riddle,edward,riddle,2014-02-19,1,1974-07-23,41,25 - 45,Caucasian,...,2014-02-19,2014-03-31,2014-04-18,14,5,40,1,1,True,True


In [165]:
age_cat_Df = pd.DataFrame()
age_cat_Df = pd.get_dummies( df['age_cat'] , prefix='age_cat' )
age_cat_Df.head()

Unnamed: 0,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25
0,0,1,0
1,1,0,0
2,0,0,1
5,1,0,0
6,1,0,0


In [166]:
df = pd.concat([df,age_cat_Df],axis=1)
df.drop('age_cat',axis=1,inplace=True)
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,race,juv_fel_count,...,priors_count.1,start,end,event,two_year_recid,guessed_recid,actual_recid,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25
0,1,miguel hernandez,miguel,hernandez,2013-08-14,1,1947-04-18,69,Other,0,...,0,0,327,0,0,False,False,0,1,0
1,3,kevon dixon,kevon,dixon,2013-01-27,1,1982-01-22,34,African-American,0,...,0,9,159,1,1,False,True,1,0,0
2,4,ed philo,ed,philo,2013-04-14,1,1991-05-14,24,African-American,0,...,4,0,63,0,1,False,True,0,0,1
5,7,marsha miles,marsha,miles,2013-11-30,1,1971-08-22,44,Other,0,...,0,1,853,0,0,False,False,1,0,0
6,8,edward riddle,edward,riddle,2014-02-19,1,1974-07-23,41,Caucasian,0,...,14,5,40,1,1,True,True,1,0,0


In [167]:
score_text_Df = pd.DataFrame()
score_text_Df = pd.get_dummies( df['score_text'] , prefix='score_text' )
score_text_Df.head()

Unnamed: 0,score_text_High,score_text_Low,score_text_Medium
0,0,1,0
1,0,1,0
2,0,1,0
5,0,1,0
6,0,0,1


In [168]:
df = pd.concat([df,score_text_Df],axis=1)
df.drop('score_text',axis=1,inplace=True)
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,race,juv_fel_count,...,event,two_year_recid,guessed_recid,actual_recid,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,score_text_High,score_text_Low,score_text_Medium
0,1,miguel hernandez,miguel,hernandez,2013-08-14,1,1947-04-18,69,Other,0,...,0,0,False,False,0,1,0,0,1,0
1,3,kevon dixon,kevon,dixon,2013-01-27,1,1982-01-22,34,African-American,0,...,1,1,False,True,1,0,0,0,1,0
2,4,ed philo,ed,philo,2013-04-14,1,1991-05-14,24,African-American,0,...,0,1,False,True,0,0,1,0,1,0
5,7,marsha miles,marsha,miles,2013-11-30,1,1971-08-22,44,Other,0,...,0,0,False,False,1,0,0,0,1,0
6,8,edward riddle,edward,riddle,2014-02-19,1,1974-07-23,41,Caucasian,0,...,1,1,True,True,1,0,0,0,0,1


In [169]:
v_score_text_Df = pd.DataFrame()
v_score_text_Df = pd.get_dummies( df['v_score_text'] , prefix='v_score_text' )
v_score_text_Df.head()

Unnamed: 0,v_score_text_High,v_score_text_Low,v_score_text_Medium
0,0,1,0
1,0,1,0
2,0,1,0
5,0,1,0
6,0,1,0


In [170]:
df = pd.concat([df,v_score_text_Df],axis=1)
df.drop('v_score_text',axis=1,inplace=True)
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,race,juv_fel_count,...,actual_recid,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,score_text_High,score_text_Low,score_text_Medium,v_score_text_High,v_score_text_Low,v_score_text_Medium
0,1,miguel hernandez,miguel,hernandez,2013-08-14,1,1947-04-18,69,Other,0,...,False,0,1,0,0,1,0,0,1,0
1,3,kevon dixon,kevon,dixon,2013-01-27,1,1982-01-22,34,African-American,0,...,True,1,0,0,0,1,0,0,1,0
2,4,ed philo,ed,philo,2013-04-14,1,1991-05-14,24,African-American,0,...,True,0,0,1,0,1,0,0,1,0
5,7,marsha miles,marsha,miles,2013-11-30,1,1971-08-22,44,Other,0,...,False,1,0,0,0,1,0,0,1,0
6,8,edward riddle,edward,riddle,2014-02-19,1,1974-07-23,41,Caucasian,0,...,True,1,0,0,0,0,1,0,1,0


In [171]:
df.shape
df.head()

Unnamed: 0,id,name,first,last,compas_screening_date,sex,dob,age,race,juv_fel_count,...,actual_recid,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,score_text_High,score_text_Low,score_text_Medium,v_score_text_High,v_score_text_Low,v_score_text_Medium
0,1,miguel hernandez,miguel,hernandez,2013-08-14,1,1947-04-18,69,Other,0,...,False,0,1,0,0,1,0,0,1,0
1,3,kevon dixon,kevon,dixon,2013-01-27,1,1982-01-22,34,African-American,0,...,True,1,0,0,0,1,0,0,1,0
2,4,ed philo,ed,philo,2013-04-14,1,1991-05-14,24,African-American,0,...,True,0,0,1,0,1,0,0,1,0
5,7,marsha miles,marsha,miles,2013-11-30,1,1971-08-22,44,Other,0,...,False,1,0,0,0,1,0,0,1,0
6,8,edward riddle,edward,riddle,2014-02-19,1,1974-07-23,41,Caucasian,0,...,True,1,0,0,0,0,1,0,1,0


Finding correlation 

In [172]:
corrDf = df.corr() 
corrDf

Unnamed: 0,id,sex,age,juv_fel_count,decile_score,juv_misd_count,juv_other_count,priors_count,days_b_screening_arrest,c_days_from_compas,...,actual_recid,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,score_text_High,score_text_Low,score_text_Medium,v_score_text_High,v_score_text_Low,v_score_text_Medium
id,1.0,-0.033748,-0.017994,0.009988,0.005631,-0.004422,0.015342,-0.006252,0.008677,0.000438,...,0.02011,0.014098,-0.020532,0.003341,-0.015933,-0.015832,0.03204,-0.00811,-0.007553,0.013926
sex,-0.033748,1.0,-0.008407,0.050458,0.060575,0.050718,0.058767,0.118722,0.030453,0.005721,...,0.100911,-0.01384,0.006243,0.010427,0.070934,-0.039625,-0.01793,0.082033,-0.105279,0.060096
age,-0.017994,-0.008407,1.0,-0.047753,-0.403709,-0.111591,-0.172787,0.119773,-0.068926,0.073493,...,-0.18918,-0.216336,0.828521,-0.557148,-0.207713,0.309393,-0.166505,-0.234126,0.46606,-0.354674
juv_fel_count,0.009988,0.050458,-0.047753,1.0,0.166167,0.079766,0.043438,0.194072,0.024331,-0.005396,...,0.081715,0.029354,-0.047818,0.01195,0.167444,-0.12226,-0.009793,0.119855,-0.129891,0.061336
decile_score,0.005631,0.060575,-0.403709,0.166167,1.0,0.217722,0.194605,0.44783,0.085236,-0.000672,...,0.365487,0.032739,-0.307958,0.26419,0.752665,-0.870335,0.319304,0.477984,-0.643945,0.383912
juv_misd_count,-0.004422,0.050718,-0.111591,0.079766,0.217722,1.0,0.257007,0.267675,0.026999,0.009993,...,0.110298,0.024876,-0.092727,0.06156,0.219977,-0.167662,-0.004886,0.164266,-0.165429,0.070136
juv_other_count,0.015342,0.058767,-0.172787,0.043438,0.194605,0.257007,1.0,0.108757,0.023331,-0.00583,...,0.12591,-0.06669,-0.110879,0.189121,0.163351,-0.161756,0.038567,0.154503,-0.185225,0.09874
priors_count,-0.006252,0.118722,0.119773,0.194072,0.44783,0.267675,0.108757,1.0,0.02135,0.003762,...,0.290607,0.128317,0.058449,-0.211281,0.371307,-0.38065,0.102353,0.14752,-0.166962,0.083336
days_b_screening_arrest,0.008677,0.030453,-0.068926,0.024331,0.085236,0.026999,0.023331,0.02135,1.0,0.006457,...,0.074836,0.02716,-0.064668,0.031181,0.067811,-0.070429,0.019725,0.055998,-0.073994,0.043377
c_days_from_compas,0.000438,0.005721,0.073493,-0.005396,-0.000672,0.009993,-0.00583,0.003762,0.006457,1.0,...,-0.039899,-0.023824,0.067325,-0.037794,0.005067,-0.00778,0.004325,-0.012989,0.022535,-0.016004


In [173]:
corrDf['two_year_recid'].sort_values(ascending =False)

actual_recid               1.000000
two_year_recid             1.000000
is_recid                   0.942816
event                      0.793184
decile_score               0.365487
decile_score.1             0.365487
is_violent_recid           0.347574
guessed_recid              0.314832
v_decile_score             0.308322
priors_count.1             0.290607
priors_count               0.290607
score_text_High            0.273244
v_score_text_High          0.195877
v_score_text_Medium        0.140141
juv_other_count            0.125910
c_charge_degree            0.120332
score_text_Medium          0.114638
age_cat_Less than 25       0.111027
juv_misd_count             0.110298
sex                        0.100911
start                      0.083427
juv_fel_count              0.081715
days_b_screening_arrest    0.074836
r_days_from_arrest         0.046149
age_cat_25 - 45            0.022041
id                         0.020110
c_days_from_compas        -0.039899
age_cat_Greater than 45   -0

Features that I chose

In [174]:
df_X = pd.concat( [v_score_text_Df,
                   score_text_Df,
                   age_cat_Df,
                   df['juv_fel_count'],
                   df['juv_misd_count'],
                   df['juv_other_count'],
                   df['priors_count'],
                   df['c_charge_degree'],
                   df['sex']
                    ] , axis=1 )
df_X.head()

Unnamed: 0,v_score_text_High,v_score_text_Low,v_score_text_Medium,score_text_High,score_text_Low,score_text_Medium,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,sex
0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,1
1,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1
2,0,1,0,0,1,0,0,0,1,0,0,1,4,1,1
5,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1
6,0,1,0,0,0,1,1,0,0,0,0,0,14,1,1


In [175]:
df_X.shape

(6172, 15)

In [176]:
sourceRow=6172

#We -1 because computer always start from 0
#Source Dataset: feature
source_X = df_X.loc[0:sourceRow-1,:]

#Source Dataset: label
source_y = df.loc[0:sourceRow-1,'two_year_recid']   

#Pridiction Dataset: feature
pred_X = df_X.loc[sourceRow:,:]

print('There are:',source_X.shape[0],' sets of data for trainning.')

print('And',pred_X.shape[0], ' sets of data for testing.')

#80% trainning 20% testing
train_X, test_X, train_y, test_y = train_test_split(source_X ,source_y, train_size=.8)


There are: 5286  sets of data for trainning.
And 886  sets of data for testing.


In [177]:
#create the model: logisic regression
model = LogisticRegression()

#train the model
model.fit( train_X , train_y )

lr=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [178]:
model.score(test_X , test_y )

0.7051039697542533

In [179]:
x = df_X.values
target = df.two_year_recid
y = target.values
lr.fit(x,y)

LogisticRegression(multi_class='ovr', n_jobs=1, solver='liblinear')

In [180]:
coeffs = pd.DataFrame(np.exp(lr.coef_), columns=df_X.columns)
coeffs

Unnamed: 0,v_score_text_High,v_score_text_Low,v_score_text_Medium,score_text_High,score_text_Low,score_text_Medium,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,juv_fel_count,juv_misd_count,juv_other_count,priors_count,c_charge_degree,sex
0,1.15445,0.751469,0.811907,1.350147,0.559768,0.931973,0.913789,0.566157,1.361475,0.982264,0.927699,1.247575,1.132597,1.189446,1.3871
