In [59]:
## Basic Modules to Import in Each Notebook

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

In [60]:
df = pd.read_csv("combined2.csv")
df.head()

Unnamed: 0,id,sex,age,age_cat,race,juve_fel_count,decile_score,juv_misd_count,juv_other_count,prios_count,...,decile_score_a,score_text,v_type_of_assessment,v_decile_score_v,v_score_text,priors_count,start,end_,event,two_year_recid
0,1,Male,69,Greater than 45,Other,0,1,0,0,0,...,1,Low,Risk of Violence,1,Low,0,0,327,0,0
1,3,Male,34,25 - 45,African-American,0,3,0,0,0,...,3,Low,Risk of Violence,1,Low,0,9,159,1,1
2,4,Male,24,Less than 25,African-American,0,4,0,1,4,...,4,Low,Risk of Violence,3,Low,4,0,63,0,1
3,5,Male,23,Less than 25,African-American,0,8,1,0,1,...,8,High,Risk of Violence,6,Medium,1,0,1174,0,0
4,6,Male,43,25 - 45,Other,0,1,0,0,2,...,1,Low,Risk of Violence,1,Low,2,0,1102,0,0


In [None]:
for col in df.columns: 
    print(col)

In [98]:
reducedDF = df[['age', 'age_cat', 'race', 'juve_fel_count', 'juv_misd_count', 'prios_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'priors_count', 'two_year_recid']]
reducedDF.head()

Unnamed: 0,age,age_cat,race,juve_fel_count,juv_misd_count,prios_count,violence,drugs,theft,arrest_case_no_charge,traffic,disorderly_conduct,sexual_misconduct,priors_count,two_year_recid
0,69,Greater than 45,Other,0,0,0,7,0,0,0,0,0,0,0,0
1,34,25 - 45,African-American,0,0,0,7,0,0,0,0,0,0,0,1
2,24,Less than 25,African-American,0,0,4,0,6,0,0,0,0,0,4,1
3,23,Less than 25,African-American,0,1,1,0,6,0,0,0,0,0,1,0
4,43,25 - 45,Other,0,0,2,0,0,0,4,0,0,0,2,0


In [99]:
reduced_without_race = df[['age', 'age_cat', 'juve_fel_count', 'juv_misd_count', 'prios_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'priors_count', 'two_year_recid']]

In [100]:
reduced_without_race_dummies = pd.get_dummies(reduced_without_race)
reduced_without_race_dummies.head()

Unnamed: 0,age,juve_fel_count,juv_misd_count,prios_count,violence,drugs,theft,arrest_case_no_charge,traffic,disorderly_conduct,sexual_misconduct,priors_count,two_year_recid,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25
0,69,0,0,0,7,0,0,0,0,0,0,0,0,0,1,0
1,34,0,0,0,7,0,0,0,0,0,0,0,1,1,0,0
2,24,0,0,4,0,6,0,0,0,0,0,4,1,0,0,1
3,23,0,1,1,0,6,0,0,0,0,0,1,0,0,0,1
4,43,0,0,2,0,0,0,4,0,0,0,2,0,1,0,0


## Train / Test on Entire Data Without "Bias"

In [101]:
X = reduced_without_race_dummies[['juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'juv_misd_count', 'prios_count']]
y = reduced_without_race_dummies['two_year_recid']

In [102]:
from sklearn.model_selection import train_test_split
from sklearn import tree
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [103]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.6549805230940456

In [104]:
print(f"Training Data Score: {rf.score(X_train, y_train)}")
print(f"Testing Data Score: {rf.score(X_test, y_test)}")

Training Data Score: 0.7117668893838159
Testing Data Score: 0.6549805230940456


In [105]:
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.2965388572225831, 'prios_count'),
 (0.28475514557962417, 'priors_count'),
 (0.06648383372433121, 'juv_misd_count'),
 (0.06358474635543848, 'age_cat_Less than 25'),
 (0.05570766485482983, 'juve_fel_count'),
 (0.055015195527456394, 'age_cat_Greater than 45'),
 (0.033686675830720963, 'age_cat_25 - 45'),
 (0.02597425245927496, 'arrest_case_no_charge'),
 (0.025185404807361995, 'drugs'),
 (0.022469959656816144, 'violence'),
 (0.022370714300446883, 'theft'),
 (0.021956968908356977, 'traffic'),
 (0.019910316489977063, 'disorderly_conduct'),
 (0.006360264282781583, 'sexual_misconduct')]

## Trying to see if we can train without race then look at confusion matrices after with race joined back onto it, by id??  Wednesday Work Start 

## Confusion Matrices 

In [106]:
predictions = rf.predict(X_test)
X_test.head()

Unnamed: 0,juve_fel_count,priors_count,violence,drugs,theft,arrest_case_no_charge,traffic,disorderly_conduct,sexual_misconduct,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,juv_misd_count,prios_count
1315,0,0,0,0,5,0,0,0,0,1,0,0,0,0
1893,0,14,0,0,0,0,3,0,0,1,0,0,0,14
6937,0,3,0,0,0,4,0,0,0,0,0,1,0,3
7021,0,0,7,0,0,0,0,0,0,0,1,0,0,0
4908,0,11,0,0,5,0,0,0,0,1,0,0,0,11


In [107]:
X_test['Prediction'] = predictions
X_test['Actual'] = y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [108]:
X_test_see_no_race = X_test[['Prediction', 'Actual']]
X_test_see_no_race.head()

Unnamed: 0,Prediction,Actual
1315,0,1
1893,0,1
6937,1,0
7021,0,0
4908,1,1


In [109]:
joinedDF_trained_no_race = X_test_see_no_race.join(reducedDF, how='outer')
joinedDF_trained_no_race.head(20)

Unnamed: 0,Prediction,Actual,age,age_cat,race,juve_fel_count,juv_misd_count,prios_count,violence,drugs,theft,arrest_case_no_charge,traffic,disorderly_conduct,sexual_misconduct,priors_count,two_year_recid
0,,,69,Greater than 45,Other,0,0,0,7,0,0,0,0,0,0,0,0
1,,,34,25 - 45,African-American,0,0,0,7,0,0,0,0,0,0,0,1
2,,,24,Less than 25,African-American,0,0,4,0,6,0,0,0,0,0,4,1
3,,,23,Less than 25,African-American,0,1,1,0,6,0,0,0,0,0,1,0
4,,,43,25 - 45,Other,0,0,2,0,0,0,4,0,0,0,2,0
5,,,44,25 - 45,Other,0,0,0,7,0,0,0,0,0,0,0,0
6,,,41,25 - 45,Caucasian,0,0,14,0,0,5,0,0,0,0,14,1
7,,,43,25 - 45,Other,0,0,3,0,0,0,4,0,0,0,3,0
8,0.0,0.0,39,25 - 45,Caucasian,0,0,0,7,0,0,0,0,0,0,0,0
9,,,21,Less than 25,Caucasian,0,0,1,0,0,5,0,0,0,0,1,1


In [110]:
joinedDF_no_na = joinedDF_trained_no_race.dropna()
joinedDF_no_na.head()

Unnamed: 0,Prediction,Actual,age,age_cat,race,juve_fel_count,juv_misd_count,prios_count,violence,drugs,theft,arrest_case_no_charge,traffic,disorderly_conduct,sexual_misconduct,priors_count,two_year_recid
8,0.0,0.0,39,25 - 45,Caucasian,0,0,0,7,0,0,0,0,0,0,0,0
14,0.0,1.0,47,Greater than 45,Caucasian,0,0,1,0,0,0,4,0,0,0,1,1
15,0.0,1.0,31,25 - 45,African-American,0,0,7,0,0,0,0,3,0,0,7,1
17,1.0,0.0,25,25 - 45,African-American,0,0,3,0,0,5,0,0,0,0,3,0
19,1.0,1.0,31,25 - 45,Caucasian,0,0,5,0,6,0,0,0,0,0,5,1


## Confusion Matrix .loc by Caucasian - Trained and Tested on no Race

In [111]:
caucasian = joinedDF_no_na.loc[joinedDF_no_na['race'] == 'Caucasian']
caucasian_reduced = caucasian[['race','Prediction','Actual']]
caucasian_reduced.head()

Unnamed: 0,race,Prediction,Actual
8,Caucasian,0.0,0.0
14,Caucasian,0.0,1.0
19,Caucasian,1.0,1.0
23,Caucasian,0.0,0.0
31,Caucasian,1.0,1.0


In [112]:
from sklearn.metrics import confusion_matrix
y_true_caucasian = caucasian_reduced['Actual']
y_pred_caucasian = caucasian_reduced['Prediction']
confusion_matrix(y_true_caucasian, y_pred_caucasian)

array([[294,  72],
       [141, 104]], dtype=int64)

In [113]:
tn, fp, fn, tp = confusion_matrix(y_true_caucasian, y_pred_caucasian).ravel()
(tn, fp, fn, tp)

(294, 72, 141, 104)

In [114]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix trained on no race for Caucasian: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix trained on no race for Caucasian: 
Total: 611 
False Positive: 0.11783960720130933 
False Negative: 0.23076923076923078


## Confusion Matrix Loc by A-A - Trained and Tested on no Race

In [115]:
african_american = joinedDF_no_na.loc[joinedDF_no_na['race'] == 'African-American']
# caucasian.head()
african_american_reduced = african_american[['race','Prediction','Actual']]
african_american_reduced.head()

Unnamed: 0,race,Prediction,Actual
15,African-American,0.0,1.0
17,African-American,1.0,0.0
41,African-American,0.0,0.0
44,African-American,0.0,1.0
57,African-American,0.0,0.0


In [116]:
from sklearn.metrics import confusion_matrix
y_true_african_american = african_american_reduced['Actual']
y_pred_african_american = african_american_reduced['Prediction']
confusion_matrix(y_true_african_american, y_pred_african_american)

array([[330, 141],
       [186, 285]], dtype=int64)

In [117]:
tn, fp, fn, tp = confusion_matrix(y_true_african_american, y_pred_african_american).ravel()
(tn, fp, fn, tp)

(330, 141, 186, 285)

In [118]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix trained on no race for African-American: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix trained on no race for African-American: 
Total: 942 
False Positive: 0.14968152866242038 
False Negative: 0.19745222929936307


## Confusion Matrix Loc by Hispanic - Trained and Tested on no Race

In [119]:
hispanic = joinedDF_no_na.loc[joinedDF_no_na['race'] == 'Hispanic']
# caucasian.head()
hispanic_reduced = hispanic[['race','Prediction','Actual']]
hispanic_reduced.head()

Unnamed: 0,race,Prediction,Actual
156,Hispanic,0.0,0.0
168,Hispanic,0.0,0.0
176,Hispanic,0.0,0.0
177,Hispanic,0.0,0.0
185,Hispanic,0.0,0.0


In [120]:
from sklearn.metrics import confusion_matrix
y_true_hispanic = hispanic_reduced['Actual']
y_pred_hispanic = hispanic_reduced['Prediction']
confusion_matrix(y_true_hispanic, y_pred_hispanic)

array([[77, 12],
       [40, 19]], dtype=int64)

In [121]:
tn, fp, fn, tp = confusion_matrix(y_true_hispanic, y_pred_hispanic).ravel()
(tn, fp, fn, tp)

(77, 12, 40, 19)

In [122]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix trained on no race for Hispanic: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix trained on no race for Hispanic: 
Total: 148 
False Positive: 0.08108108108108109 
False Negative: 0.2702702702702703


## Confusion Matrix Loc by Native-American - Trained and Tested on no Race

In [123]:
native_american = joinedDF_no_na.loc[joinedDF_no_na['race'] == 'Native American']
# caucasian.head()
native_american_reduced = native_american[['race','Prediction','Actual']]
native_american_reduced.head()

Unnamed: 0,race,Prediction,Actual
457,Native American,0.0,1.0
1163,Native American,1.0,0.0
1703,Native American,0.0,0.0
1858,Native American,0.0,0.0
3639,Native American,1.0,1.0


In [124]:
from sklearn.metrics import confusion_matrix
y_true_native_american = native_american_reduced['Actual']
y_pred_native_american = native_american_reduced['Prediction']
confusion_matrix(y_true_native_american, y_pred_native_american)

array([[2, 1],
       [1, 1]], dtype=int64)

In [125]:
tn, fp, fn, tp = confusion_matrix(y_true_native_american, y_pred_native_american).ravel()
(tn, fp, fn, tp)

(2, 1, 1, 1)

In [126]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix trained on no race for Native American: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix trained on no race for Native American: 
Total: 5 
False Positive: 0.2 
False Negative: 0.2


## Confusion Matrix Loc by Other - Trained and Tested on no Race


In [127]:
other = joinedDF_no_na.loc[joinedDF_no_na['race'] == 'Other']
# caucasian.head()
other_reduced = other[['race','Prediction','Actual']]
other_reduced.head()

Unnamed: 0,race,Prediction,Actual
26,Other,0.0,0.0
51,Other,0.0,0.0
91,Other,1.0,1.0
131,Other,0.0,0.0
157,Other,0.0,0.0


In [128]:
from sklearn.metrics import confusion_matrix
y_true_other = other_reduced['Actual']
y_pred_other = other_reduced['Prediction']
confusion_matrix(y_true_other, y_pred_other)

array([[48,  6],
       [14, 13]], dtype=int64)

In [129]:
tn, fp, fn, tp = confusion_matrix(y_true_other, y_pred_other).ravel()
(tn, fp, fn, tp)

(48, 6, 14, 13)

In [130]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix trained on no race for Other: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix trained on no race for Other: 
Total: 81 
False Positive: 0.07407407407407407 
False Negative: 0.1728395061728395


## Confusion Matrix Loc by Asian - Trained and Tested on no Race


In [131]:
asian = joinedDF_no_na.loc[joinedDF_no_na['race'] == 'Asian']
# caucasian.head()
asian_reduced = asian[['race','Prediction','Actual']]
asian_reduced.head()

Unnamed: 0,race,Prediction,Actual
486,Asian,0.0,1.0
1586,Asian,0.0,0.0
1860,Asian,0.0,1.0
2034,Asian,0.0,1.0
3752,Asian,1.0,0.0


In [132]:
from sklearn.metrics import confusion_matrix
y_true_asian = asian_reduced['Actual']
y_pred_asian = asian_reduced['Prediction']
confusion_matrix(y_true_asian, y_pred_asian)

array([[2, 2],
       [4, 2]], dtype=int64)

In [133]:
tn, fp, fn, tp = confusion_matrix(y_true_asian, y_pred_asian).ravel()
(tn, fp, fn, tp)

(2, 2, 4, 2)

In [134]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix trained on no race for Asian: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix trained on no race for Asian: 
Total: 10 
False Positive: 0.2 
False Negative: 0.4


# Wednesday Work End 

## Train on Entire Data, Test on Specific Races

# White

In [None]:
caucasian = df.loc[df['race'] == 'Caucasian']
caucasian_reduced = caucasian[['sex', 'age', 'age_cat', 'juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct','two_year_recid','juv_misd_count', 'prios_count']]
caucasian_DF = pd.get_dummies(caucasian_reduced)
X_test_caucasian = caucasian_DF[['juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'sex_Female', 'sex_Male', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'juv_misd_count', 'prios_count']]
y_test_caucasian = caucasian_DF['two_year_recid']

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=8000)
rf = rf.fit(X, y)
rf.score(X_test_caucasian, y_test_caucasian)

In [None]:
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# African-American

In [None]:
african_american = df.loc[df['race'] == 'African-American']
african_american_reduced = african_american[['sex', 'age', 'age_cat', 'juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct','two_year_recid','juv_misd_count', 'prios_count']]
african_american_DF = pd.get_dummies(african_american_reduced)
X_test_aa = african_american_DF[['juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'sex_Female', 'sex_Male', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'juv_misd_count', 'prios_count']]
y_test_aa = african_american_DF['two_year_recid']

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=8000)
rf = rf.fit(X, y)
rf.score(X_test_aa, y_test_aa)

In [None]:
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Asian

In [None]:
asian = df.loc[df['race'] == 'Asian']
asian_reduced = asian[['sex', 'age', 'age_cat', 'juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct','two_year_recid','juv_misd_count', 'prios_count']]
asian_DF = pd.get_dummies(asian_reduced)
X_test_asian = asian_DF[['juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'sex_Female', 'sex_Male', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'juv_misd_count', 'prios_count']]
y_test_asian = asian_DF['two_year_recid']

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=8000)
rf = rf.fit(X, y)
rf.score(X_test_asian, y_test_asian)

In [None]:
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Hispanic

In [None]:
hispanic = df.loc[df['race'] == 'Hispanic']
hispanic_reduced = hispanic[['sex', 'age', 'age_cat', 'juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct','two_year_recid','juv_misd_count', 'prios_count']]
hispanic_DF = pd.get_dummies(hispanic_reduced)
X_test_hispanic = hispanic_DF[['juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'sex_Female', 'sex_Male', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'juv_misd_count', 'prios_count']]
y_test_hispanic = hispanic_DF['two_year_recid']

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=8000)
rf = rf.fit(X, y)
rf.score(X_test_hispanic, y_test_hispanic)

In [None]:
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Native American

In [None]:
native_american = df.loc[df['race'] == 'Native American']
native_american_reduced = native_american[['sex', 'age', 'age_cat', 'juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct','two_year_recid','juv_misd_count', 'prios_count']]
native_american_DF = pd.get_dummies(native_american_reduced)
X_test_native_american = native_american_DF[['juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'sex_Female', 'sex_Male', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'juv_misd_count', 'prios_count']]
y_test_native_american = native_american_DF['two_year_recid']

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=8000)
rf = rf.fit(X, y)
rf.score(X_test_native_american, y_test_native_american)

In [None]:
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Other

In [None]:
other = df.loc[df['race'] == 'Other']
other_reduced = other[['sex', 'age', 'age_cat', 'juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct','two_year_recid','juv_misd_count', 'prios_count']]
other_DF = pd.get_dummies(other_reduced)
X_test_other = other_DF[['juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'sex_Female', 'sex_Male', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'juv_misd_count', 'prios_count']]
y_test_other = other_DF['two_year_recid']

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=8000)
rf = rf.fit(X, y)
rf.score(X_test_other, y_test_other)

In [None]:
feature_names = X.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

# Adding Race In 

In [62]:
reducedDF_with_race_dummies = pd.get_dummies(reducedDF)
reducedDF_with_race_dummies.head()

Unnamed: 0,age,juve_fel_count,juv_misd_count,prios_count,violence,drugs,theft,arrest_case_no_charge,traffic,disorderly_conduct,...,two_year_recid,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other
0,69,0,0,0,7,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,34,0,0,0,7,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
2,24,0,0,4,0,6,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0
3,23,0,1,1,0,6,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
4,43,0,0,2,0,0,0,4,0,0,...,0,1,0,0,0,0,0,0,0,1


In [63]:
X_race = reducedDF_with_race_dummies[['juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'juv_misd_count', 'prios_count', 'race_African-American', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Native American', 'race_Other']]
y_race = reducedDF_with_race_dummies['two_year_recid']

## Train ML on all data with race, now test on race 

In [64]:
from sklearn.model_selection import train_test_split
from sklearn import tree
X_train, X_test, y_train, y_test = train_test_split(X_race, y_race, random_state=42)

In [65]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.6494156928213689

In [66]:
print(f"Training Data Score: {rf.score(X_train, y_train)}")
print(f"Testing Data Score: {rf.score(X_test, y_test)}")

Training Data Score: 0.7412769116555308
Testing Data Score: 0.6494156928213689


In [67]:
feature_names = X_race.columns
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.24888582126638265, 'prios_count'),
 (0.2423927038050171, 'priors_count'),
 (0.055484344784689325, 'juv_misd_count'),
 (0.050867760625512896, 'age_cat_Less than 25'),
 (0.04156427546172879, 'age_cat_Greater than 45'),
 (0.04113338747627199, 'juve_fel_count'),
 (0.03900310639610396, 'race_African-American'),
 (0.03166108899709337, 'drugs'),
 (0.031060604950868678, 'theft'),
 (0.03090203589422855, 'race_Caucasian'),
 (0.030115217144342565, 'age_cat_25 - 45'),
 (0.02944902037564652, 'arrest_case_no_charge'),
 (0.027050865229310623, 'violence'),
 (0.024658553056012687, 'traffic'),
 (0.024082346198354154, 'disorderly_conduct'),
 (0.021839079233632847, 'race_Hispanic'),
 (0.01738314367574995, 'race_Other'),
 (0.007111486941659801, 'sexual_misconduct'),
 (0.0032277047534603697, 'race_Asian'),
 (0.0021274537339333027, 'race_Native American')]

In [68]:
predictions = rf.predict(X_test)

In [69]:
X_test['Prediction'] = predictions
X_test['Actual'] = y_test


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Separate DF into Only Races / Prediction / Actual 

In [70]:
X_test_see_race = X_test[['race_African-American', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Native American', 'race_Other', 'Prediction', 'Actual']]
X_test_see_race.head()

Unnamed: 0,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,Prediction,Actual
1315,1,0,0,0,0,0,0,1
1893,1,0,0,0,0,0,0,1
6937,0,0,0,0,0,1,1,0
7021,0,0,1,0,0,0,0,0
4908,0,0,1,0,0,0,1,1


## Test Confusion Matrix on AA

In [71]:
X_test_see_race_aa = X_test_see_race.loc[X_test_see_race['race_African-American'] == 1]
X_test_see_race_aa.head()

Unnamed: 0,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,Prediction,Actual
1315,1,0,0,0,0,0,0,1
1893,1,0,0,0,0,0,0,1
1894,1,0,0,0,0,0,1,1
800,1,0,0,0,0,0,0,0
2845,1,0,0,0,0,0,0,1


In [72]:
#print(f'Number of Predicted: {len(X_test_see_race_aa.loc[X_test_see_race_aa["Prediction"]==True])}, Number of Actual:{len(X_test_see_race_aa.loc[X_test_see_race_aa["Actual"]==True])}, {len(X_test_see_race_aa.loc[X_test_see_race_aa["Prediction"]==True])/len(X_test_see_race_aa["Actual"])}')


In [73]:
from sklearn.metrics import confusion_matrix
y_true = X_test_see_race_aa['Actual']
y_pred = X_test_see_race_aa['Prediction']
confusion_matrix(y_true, y_pred)

array([[322, 149],
       [186, 285]], dtype=int64)

In [74]:
# ### sklearn.metrics.confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)[source]¶
# ## TEST
# tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()
# (tn, fp, fn, tp)

In [75]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
(tn, fp, fn, tp)


(322, 149, 186, 285)

In [76]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix for African-American: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix for African-American: 
Total: 942 
False Positive: 0.15817409766454352 
False Negative: 0.19745222929936307


## Confusion Matrix White

In [77]:
X_test_see_race_white = X_test_see_race.loc[X_test_see_race['race_Caucasian'] == 1]
X_test_see_race_white.head()

Unnamed: 0,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,Prediction,Actual
7021,0,0,1,0,0,0,0,0
4908,0,0,1,0,0,0,1,1
6107,0,0,1,0,0,0,0,0
3066,0,0,1,0,0,0,0,0
5106,0,0,1,0,0,0,0,0


In [78]:
from sklearn.metrics import confusion_matrix
y_true = X_test_see_race_white['Actual']
y_pred = X_test_see_race_white['Prediction']
confusion_matrix(y_true, y_pred)

array([[304,  62],
       [151,  94]], dtype=int64)

In [79]:
## proves that our machine learning isnt doing anything more ethical 

In [80]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
(tn, fp, fn, tp)

(304, 62, 151, 94)

In [81]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix for Caucasian: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix for Caucasian: 
Total: 611 
False Positive: 0.10147299509001637 
False Negative: 0.24713584288052373


## Confusion Matrix Hispanic

In [82]:
X_test_see_race_hispanic = X_test_see_race.loc[X_test_see_race['race_Hispanic'] == 1]
X_test_see_race_hispanic.head()

Unnamed: 0,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,Prediction,Actual
4959,0,0,0,1,0,0,0,0
2846,0,0,0,1,0,0,0,1
5015,0,0,0,1,0,0,0,0
4684,0,0,0,1,0,0,0,0
239,0,0,0,1,0,0,0,0


In [83]:
from sklearn.metrics import confusion_matrix
y_true = X_test_see_race_hispanic['Actual']
y_pred = X_test_see_race_hispanic['Prediction']
confusion_matrix(y_true, y_pred)

array([[77, 12],
       [41, 18]], dtype=int64)

In [84]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
(tn, fp, fn, tp)

(77, 12, 41, 18)

In [85]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix for Hispanic: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix for Hispanic: 
Total: 148 
False Positive: 0.08108108108108109 
False Negative: 0.27702702702702703


## Confusion Matrix Native American 

In [86]:
X_test_see_race_native_american = X_test_see_race.loc[X_test_see_race['race_Native American'] == 1]
X_test_see_race_native_american.head()

Unnamed: 0,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,Prediction,Actual
457,0,0,0,0,1,0,1,1
1163,0,0,0,0,1,0,1,0
1858,0,0,0,0,1,0,0,0
1703,0,0,0,0,1,0,0,0
3639,0,0,0,0,1,0,1,1


In [87]:
from sklearn.metrics import confusion_matrix
y_true = X_test_see_race_native_american['Actual']
y_pred = X_test_see_race_native_american['Prediction']
confusion_matrix(y_true, y_pred)

array([[2, 1],
       [0, 2]], dtype=int64)

In [88]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
(tn, fp, fn, tp)

(2, 1, 0, 2)

In [89]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix for Native American: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix for Native American: 
Total: 5 
False Positive: 0.2 
False Negative: 0.0


## Confusion Matrix Other

In [90]:
X_test_see_race_other = X_test_see_race.loc[X_test_see_race['race_Other'] == 1]
X_test_see_race_other.head()

Unnamed: 0,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,Prediction,Actual
6937,0,0,0,0,0,1,1,0
5849,0,0,0,0,0,1,0,0
6782,0,0,0,0,0,1,0,0
4647,0,0,0,0,0,1,1,0
2392,0,0,0,0,0,1,0,0


In [91]:
from sklearn.metrics import confusion_matrix
y_true = X_test_see_race_other['Actual']
y_pred = X_test_see_race_other['Prediction']
confusion_matrix(y_true, y_pred)

array([[44, 10],
       [13, 14]], dtype=int64)

In [92]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
(tn, fp, fn, tp)

(44, 10, 13, 14)

In [93]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix for Other: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix for Other: 
Total: 81 
False Positive: 0.12345679012345678 
False Negative: 0.16049382716049382


## Confusion Matrix Asian

In [94]:
X_test_see_race_other = X_test_see_race.loc[X_test_see_race['race_Asian'] == 1]
X_test_see_race_other.head()

Unnamed: 0,race_African-American,race_Asian,race_Caucasian,race_Hispanic,race_Native American,race_Other,Prediction,Actual
2034,0,1,0,0,0,0,0,1
5746,0,1,0,0,0,0,0,0
1860,0,1,0,0,0,0,0,1
1586,0,1,0,0,0,0,0,0
5364,0,1,0,0,0,0,1,0


In [95]:
from sklearn.metrics import confusion_matrix
y_true = X_test_see_race_other['Actual']
y_pred = X_test_see_race_other['Prediction']
confusion_matrix(y_true, y_pred)

array([[2, 2],
       [3, 3]], dtype=int64)

In [96]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
(tn, fp, fn, tp)

(2, 2, 3, 3)

In [97]:
total = tn + fp + fn + tp 
percent_fp = fp / total 
percent_fn = fn / total 
print(f'Confusion Matrix for Asian: \nTotal: {total} \nFalse Positive: {percent_fp} \nFalse Negative: {percent_fn}')

Confusion Matrix for Asian: 
Total: 10 
False Positive: 0.2 
False Negative: 0.3


## Dont read - dont make sense 

# White 

In [None]:
caucasian = df.loc[df['race'] == 'Caucasian']
caucasian_reduced = caucasian[['sex', 'age', 'age_cat', 'juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct','two_year_recid','juv_misd_count', 'prios_count']]
caucasian_DF = pd.get_dummies(caucasian_reduced)
X_test_caucasian = caucasian_DF[['juve_fel_count', 'priors_count', 'violence', 'drugs', 'theft', 'arrest_case_no_charge', 'traffic', 'disorderly_conduct', 'sexual_misconduct', 'sex_Female', 'sex_Male', 'age_cat_25 - 45', 'age_cat_Greater than 45', 'age_cat_Less than 25', 'juv_misd_count', 'prios_count']]
y_test_caucasian = caucasian_DF['two_year_recid']

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=8000)
rf = rf.fit(X_race, y_race)
rf.score(X_test_caucasian, y_test_caucasian)

In [None]:
#pd.DataFrame({"Person": X_test.values.reshape(-1, 1), "Prediction": predictions,  "Actual": y_test}).reset_index(drop=True)