### NC Model Applied to FL Data

In [1]:
!pip install ethnicolr



In [2]:
import pandas as pd

In [3]:
fl_data = pd.read_csv('../data/fl_reg_name_race.csv.gz', compression='gzip', 
                                 header=0, sep=',', quotechar='"')

In [4]:
fl_data.head()

Unnamed: 0,name_last,name_first,race
0,Walker,Elizabeth,nh_white
1,Palmer,Alton,nh_white
2,Mc Cleod,Alicia,nh_black
3,Scarborough,Dale,nh_white
4,Walker,Daniel,nh_white


In [5]:
fl_data.rename(columns={'race': 'fl_race'}, inplace=True)
fl_data.head()

Unnamed: 0,name_last,name_first,fl_race
0,Walker,Elizabeth,nh_white
1,Palmer,Alton,nh_white
2,Mc Cleod,Alicia,nh_black
3,Scarborough,Dale,nh_white
4,Walker,Daniel,nh_white


In [6]:
fl_samp = fl_data.sample(n = 10000, random_state = 31415)

In [7]:
from ethnicolr import pred_nc_reg_name
preds = pred_nc_reg_name(fl_samp, lname_col = "name_last", fname_col = "name_first")

313/313 - 1s


In [8]:
preds

Unnamed: 0,name_last,name_first,fl_race,race,HL+A,HL+B,HL+I,HL+M,HL+O,HL+W,NL+A,NL+B,NL+I,NL+M,NL+O,NL+W
757560,Oldenburg,James,nh_white,NL+W,6.027722e-09,6.098673e-12,1.571013e-09,3.826774e-08,0.137427,0.299829,0.013250,0.049568,7.611386e-04,0.000270,0.171335,0.327559
7949174,BOLLINGER,VIRGINIA,nh_white,NL+I,1.605675e-06,9.508462e-08,1.755933e-07,1.269503e-10,0.000424,0.002651,0.000699,0.136859,7.947510e-01,0.000066,0.015746,0.048802
1952706,Blanco,Enrique,hispanic,HL+O,1.682701e-12,2.484461e-02,5.479284e-16,8.073048e-11,0.653203,0.210335,0.005093,0.008122,9.161754e-08,0.049161,0.038115,0.011126
5649972,McMurtry,Shea,nh_white,NL+B,6.887608e-12,2.754199e-08,9.820796e-08,3.417462e-06,0.002718,0.001321,0.000825,0.699702,7.514443e-03,0.014814,0.031897,0.241206
8036160,Yukus,John,nh_white,NL+A,9.330448e-11,9.119455e-11,7.943597e-10,5.297330e-07,0.012541,0.016837,0.565154,0.030571,9.481369e-07,0.003193,0.212010,0.159694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849522,Nelson,Steven,nh_black,HL+W,1.910448e-09,5.061355e-03,1.070835e-03,1.642348e-04,0.286262,0.348349,0.000416,0.053558,1.361813e-02,0.000196,0.131855,0.159450
4185395,Hill Hausman,Vicki,nh_white,NL+W,2.469423e-11,1.765721e-13,4.090911e-12,1.145996e-05,0.003438,0.121431,0.000836,0.119120,3.982964e-03,0.020424,0.054761,0.675997
11962715,Awtrey,Cynthia,nh_white,NL+W,1.671661e-07,5.599681e-13,3.660668e-09,4.818694e-04,0.021302,0.007244,0.000829,0.309827,3.536879e-02,0.001705,0.291285,0.331958
3667369,Garza,Maria,hispanic,HL+W,1.401596e-08,3.298952e-08,1.768033e-07,2.819802e-02,0.418315,0.431341,0.000156,0.000394,6.667362e-07,0.000007,0.118751,0.002837


In [9]:
pd.crosstab(preds['fl_race'], preds['race'])

race,HL+A,HL+B,HL+I,HL+M,HL+O,HL+W,NL+A,NL+B,NL+I,NL+M,NL+O,NL+W
fl_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
asian,0,0,1,4,8,6,76,14,5,6,32,13
hispanic,8,24,8,99,732,376,56,49,16,34,106,75
multi_racial,0,2,0,2,4,5,7,10,7,6,11,18
native_indian,0,0,0,1,1,1,1,11,1,2,2,10
nh_black,0,45,0,34,35,56,74,536,93,144,96,256
nh_white,4,86,7,181,309,394,322,1101,431,591,624,2325
other,0,3,0,2,27,8,17,22,10,15,28,30
unknown,1,6,2,7,37,28,20,38,7,17,21,60


In [10]:
preds['ethnic_code'] = preds['race'].str.split("+").str[0]
preds['race_code']  = preds['race'].str.split("+").str[1]

In [11]:
pd.crosstab(preds['fl_race'], preds['race_code'])

race_code,A,B,I,M,O,W
fl_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
asian,76,14,6,10,40,19
hispanic,64,73,24,133,838,451
multi_racial,7,12,7,8,15,23
native_indian,1,11,1,3,3,11
nh_black,74,581,93,178,131,312
nh_white,326,1187,438,772,933,2719
other,17,25,10,17,55,38
unknown,21,44,9,24,58,88


In [12]:
pd.crosstab(preds['fl_race'], preds['ethnic_code'])

ethnic_code,HL,NL
fl_race,Unnamed: 1_level_1,Unnamed: 2_level_1
asian,19,146
hispanic,1247,336
multi_racial,13,59
native_indian,3,27
nh_black,170,1199
nh_white,981,5394
other,40,122
unknown,81,163


### Comparison # 1:
Accuracy using Known-Known:

(race_code == 'B') & (ethnic_code == 'NL') ==> nh_black<br>
(race_code == 'W') & (ethnic_code == 'NL') ==> nh_white

In [13]:
preds['nc_race'] = preds['race']
preds.loc[(preds.race_code=='B') & (preds.ethnic_code=='NL'), 'nc_race'] = 'nh_black'
preds.loc[(preds.race_code=='W') & (preds.ethnic_code=='NL'), 'nc_race'] = 'nh_white'

comp_1 = preds[preds.fl_race.isin(["nh_black", "nh_white"])]
comp_1.shape

(7744, 19)

In [14]:
cf = pd.crosstab(comp_1['fl_race'], comp_1['nc_race'])
print(cf)

# Total
print("Total N", cf.values.sum())

# TP/Total 
print("NH Black:",(cf.values[0,10]/cf.values[0,].sum()).round(2))
print("NH White:", (cf.values[1,11]/cf.values[1,].sum()).round(2))
print("Accuracy:", ((cf.values[0,10] + cf.values[1,11])/cf.values.sum()).round(2))

nc_race   HL+A  HL+B  HL+I  HL+M  HL+O  HL+W  NL+A  NL+I  NL+M  NL+O  \
fl_race                                                                
nh_black     0    45     0    34    35    56    74    93   144    96   
nh_white     4    86     7   181   309   394   322   431   591   624   

nc_race   nh_black  nh_white  
fl_race                       
nh_black       536       256  
nh_white      1101      2325  
Total N 7744
NH Black: 0.39
NH White: 0.36
Accuracy: 0.37


### Comparison #2: Low FP

(race_code == 'B') & (ethnic_code == 'NL') ==> nh_black<br>
(race_code == 'W') & (ethnic_code == 'NL') ==> nh_white<br>
((race_code == 'W') & (ethnic_code == 'HL')) | ((race_code == 'B') & (ethnic_code == 'HL')) ==> hispanic<br>
(race_code == 'A') & (ethnic_code == 'NL') ==> asian

In [15]:
preds['nc_race_low_fp'] = preds.race
preds.loc[(preds.race_code=='B') & (preds.ethnic_code=='NL'), 'nc_race_low_fp'] = 'nh_black'
preds.loc[(preds.race_code=='W') & (preds.ethnic_code=='NL'), 'nc_race_low_fp'] = 'nh_white'
preds.loc[((preds.race_code == 'W') & (preds.ethnic_code == 'HL')) |
        ((preds.race_code == 'B') & (preds.ethnic_code == 'HL')), 'nc_race_low_fp'] = 'hispanic'
preds.loc[(preds.race_code == 'A') & (preds.ethnic_code == 'NL'), 'nc_race_low_fp'] = 'asian'
preds

comp_2 = preds[preds.fl_race.isin(["nh_black", "nh_white", "hispanic", "asian"])]
comp_2.shape

(9492, 20)

In [16]:
cf = pd.crosstab(comp_2['fl_race'], comp_2['nc_race_low_fp'])
print(cf)

# Total
print("Total N", cf.values.sum())

# TP/Total 
print("Asian:",(cf.values[0,7]/cf.values[0,].sum()).round(2))
print("Hispanic:",(cf.values[1,8]/cf.values[1,].sum()).round(2))
print("NH Black:",(cf.values[2,9]/cf.values[2,].sum()).round(2))
print("NH White:", (cf.values[3,10]/cf.values[3,].sum()).round(2))
print("Accuracy:", ((cf.values[0,7] + cf.values[1,8] + cf.values[2,9] + cf.values[3,10])/cf.values.sum()).round(2))

nc_race_low_fp  HL+A  HL+I  HL+M  HL+O  NL+I  NL+M  NL+O  asian  hispanic  \
fl_race                                                                     
asian              0     1     4     8     5     6    32     76         6   
hispanic           8     8    99   732    16    34   106     56       400   
nh_black           0     0    34    35    93   144    96     74       101   
nh_white           4     7   181   309   431   591   624    322       480   

nc_race_low_fp  nh_black  nh_white  
fl_race                             
asian                 14        13  
hispanic              49        75  
nh_black             536       256  
nh_white            1101      2325  
Total N 9492
Asian: 0.46
Hispanic: 0.25
NH Black: 0.39
NH White: 0.36
Accuracy: 0.35


### Comparison #3: Low FN
(race_code == 'B') & (ethnic_code == 'NL') ==> nh_black<br>
(race_code == 'W') & (ethnic_code == 'NL') ==> nh_white<br>
ethnic_code == 'HL' ==> hispanic<br>
(race_code == 'A') & (ethnic_code == 'NL') ==> asian

In [23]:
preds['nc_race_low_fn'] = preds.race
preds.loc[(preds.race_code=='B') & (preds.ethnic_code=='NL'), 'nc_race_low_fn'] = 'nh_black'
preds.loc[(preds.race_code=='W') & (preds.ethnic_code=='NL'), 'nc_race_low_fn'] = 'nh_white'
preds.loc[(preds.ethnic_code=='HL'), 'nc_race_low_fn'] = 'hispanic'
preds.loc[(preds.race_code=='A') & (preds.ethnic_code=='NL'), 'nc_race_low_fn'] = 'asian'
preds

comp_3 = preds[preds.fl_race.isin(["nh_black", "nh_white", "nc_hispanic", "nc_asian"])]
comp_3.shape

(7744, 21)

In [25]:
cf = pd.crosstab(comp_3['fl_race'], comp_3['nc_race_low_fn'])
print(cf)

# Total
print("Total N", cf.values.sum())
# TP/Total 
print("Asian:",(cf.values[0,7]/cf.values[0,].sum()).round(2))
print("Hispanic:",(cf.values[1,8]/cf.values[1,].sum()).round(2))
print("NH Black:",(cf.values[2,9]/cf.values[2,].sum()).round(2))
print("NH White:", (cf.values[3,10]/cf.values[3,].sum()).round(2))
print("Accuracy:", ((cf.values[0,7] + cf.values[1,8] + cf.values[2,9] + cf.values[3,10])/cf.values.sum()).round(2))

nc_race_low_fn  NL+I  NL+M  NL+O  asian  hispanic  nh_black  nh_white
fl_race                                                              
nh_black          93   144    96     74       170       536       256
nh_white         431   591   624    322       981      1101      2325
