* https://www.census.gov/topics/population/genealogy/data/1990_census/1990_census_namefiles.html to get names from Census.
* https://pypi.org/project/ethnicolr/ Use this to predict the race of the artificial names generated.
* https://github.com/appeler/dime_race Provides several useful Jupyter Notebooks to see how we could build our own counterfactuals with racial inference

In [1]:
import pandas as pd, numpy as np, re

In [28]:
import requests, csv
last = requests.get('https://www2.census.gov/topics/genealogy/1990surnames/dist.all.last')
male_first = requests.get('https://www2.census.gov/topics/genealogy/1990surnames/dist.male.first')
female_first = requests.get('https://www2.census.gov/topics/genealogy/1990surnames/dist.female.first')

In [3]:
last.headers['content-type']

'text/plain'

In [40]:
def prepare_name_csv(input_file,csv_name):
    #Uses regular expressions to find all tabs and spaces and replace them with a single comma "," to make it a csv.
    cleaned = re.sub("[ \t]+",",",input_file.text)
    #Instead of saving it as a temporary file, this uses a list comprehension to split the lines and then the columns within them.
    dataframe = pd.DataFrame([x.split(',') for x in cleaned.split('\n')])
    #We replace the headers with the desired names.
    dataframe.rename(columns={0:"Name",1:"Frequency",2:"Cumulative",3:"Rank"}, inplace=True)
    #This method creates a blank row at the end of the table so we need to remove it.
    dataframe.drop(dataframe.tail(1).index,inplace=True) 
    #Check to make sure the user put in a proper file type for CSVs.
    if csv_name[-4:] == ".csv":
        dataframe.to_csv(csv_name, index=False)
    else:
        print("You did not provide a proper name for the csv file.")

In [31]:
prepare_name_csv(last,"1990-census-last.csv")
prepare_name_csv(male_first,"1990-census-male-first.csv")
prepare_name_csv(female_first,"1990-census-female-first.csv")

In [85]:
female = pd.read_csv("1990-census-female-first.csv")
male = pd.read_csv("1990-census-male-first.csv")
surnames = pd.read_csv("1990-census-last.csv")

In [64]:
#surnames = surnames.rename(columns={"Name":"Surname"})

In [33]:
women = female.Name.sample(n=30,random_state=2052)
men = male.Name.sample(n=30,random_state=2052)
last_names = surnames.Name.sample(n=30,random_state=2052)

In [79]:
all_names = pd.concat([female,male])
all_names.reset_index(drop=True,inplace=True)
all_names.tail()

Unnamed: 0,Name,Frequency,Cumulative,Rank
5489,ELDEN,0.004,90.026,1215
5490,DORSEY,0.004,90.029,1216
5491,DARELL,0.004,90.033,1217
5492,BRODERICK,0.004,90.036,1218
5493,ALONSO,0.004,90.04,1219


In [9]:
#full_names['Full_Name'] = full_names.Name + " " + full_names.Name
#full_names['Gender_Inferred'] = full_names.Name.apply(lambda x: "female" if x in women.values else "male")
#full_names

In [83]:
last_sm = surnames.head(1000)
last_sm.tail()

Unnamed: 0,Surname,Frequency,Cumulative,Rank,rowindex
995,Cooke,0.012,43.34,996,995
996,Velazquez,0.012,43.352,997,996
997,Whitley,0.012,43.364,998,997
998,Noel,0.012,43.376,999,998
999,Vang,0.012,43.388,1000,999


In [50]:
import ethnicolr
surnames_I = ethnicolr.census_ln(surnames, "Name", year=2010)

In [53]:
surnames_I.head()

Unnamed: 0,Name,Frequency,Cumulative,Rank,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1.006,1.006,1,70.9,23.11,0.5,0.89,2.19,2.4
1,JOHNSON,0.81,1.816,2,58.97,34.63,0.54,0.94,2.56,2.36
2,WILLIAMS,0.699,2.515,3,45.75,47.68,0.46,0.82,2.81,2.49
3,JONES,0.621,3.136,4,55.19,38.48,0.44,1.0,2.61,2.29
4,BROWN,0.621,3.757,5,57.95,35.6,0.51,0.87,2.55,2.52


In [58]:
surnames_wiki = ethnicolr.pred_wiki_ln(last_sm, "Name", num_iter=100, conf_int=0.9)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[newnamecol] = df[newnamecol].str.strip().str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["rowindex"] = df.index


['Asian,GreaterEastAsian,EastAsian', 'Asian,GreaterEastAsian,Japanese', 'Asian,IndianSubContinent', 'GreaterAfrican,Africans', 'GreaterAfrican,Muslim', 'GreaterEuropean,British', 'GreaterEuropean,EastEuropean', 'GreaterEuropean,Jewish', 'GreaterEuropean,WestEuropean,French', 'GreaterEuropean,WestEuropean,Germanic', 'GreaterEuropean,WestEuropean,Hispanic', 'GreaterEuropean,WestEuropean,Italian', 'GreaterEuropean,WestEuropean,Nordic']


In [61]:
surnames_wiki

Unnamed: 0,Name,Frequency,Cumulative,Rank,"Asian,GreaterEastAsian,EastAsian_mean","Asian,GreaterEastAsian,EastAsian_std","Asian,GreaterEastAsian,EastAsian_lb","Asian,GreaterEastAsian,EastAsian_ub","Asian,GreaterEastAsian,Japanese_mean","Asian,GreaterEastAsian,Japanese_std",...,"GreaterEuropean,WestEuropean,Hispanic_ub","GreaterEuropean,WestEuropean,Italian_mean","GreaterEuropean,WestEuropean,Italian_std","GreaterEuropean,WestEuropean,Italian_lb","GreaterEuropean,WestEuropean,Italian_ub","GreaterEuropean,WestEuropean,Nordic_mean","GreaterEuropean,WestEuropean,Nordic_std","GreaterEuropean,WestEuropean,Nordic_lb","GreaterEuropean,WestEuropean,Nordic_ub",race
0,Smith,1.006,1.006,1,0.015435,0.008714,0.003274,0.003545,0.003537,0.001969,...,0.014917,0.013964,0.007117,0.004504,0.005889,0.017312,0.010238,0.003208,0.003563,"GreaterEuropean,British"
1,Johnson,0.810,1.816,2,0.024785,0.029999,0.002048,0.002141,0.024737,0.030519,...,0.006311,0.006114,0.003786,0.001435,0.001577,0.028573,0.029203,0.002187,0.002259,"GreaterEuropean,British"
2,Williams,0.699,2.515,3,0.005855,0.004678,0.000927,0.001019,0.011563,0.008237,...,0.005146,0.015913,0.010791,0.003307,0.003909,0.005278,0.004780,0.000907,0.000955,"GreaterEuropean,British"
3,Jones,0.621,3.136,4,0.004356,0.003201,0.000624,0.001112,0.003850,0.003544,...,0.019234,0.016210,0.008727,0.002153,0.002952,0.015627,0.009541,0.004703,0.004731,"GreaterEuropean,British"
4,Brown,0.621,3.757,5,0.012530,0.008707,0.001929,0.002394,0.011073,0.008913,...,0.001743,0.005902,0.004755,0.001236,0.001271,0.001035,0.000796,0.000085,0.000119,"GreaterEuropean,British"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Alexander,0.085,18.490,96,0.004838,0.002798,0.001056,0.001358,0.007086,0.004169,...,0.004250,0.029825,0.018336,0.007028,0.008605,0.024106,0.016869,0.003281,0.003359,"GreaterEuropean,British"
96,Russell,0.085,18.574,97,0.005207,0.003064,0.001062,0.001072,0.004986,0.003391,...,0.001171,0.011022,0.008150,0.001971,0.003162,0.011070,0.011108,0.001074,0.001188,"GreaterEuropean,British"
97,Griffin,0.084,18.659,98,0.001125,0.000971,0.000180,0.000185,0.007676,0.004862,...,0.002911,0.009488,0.005303,0.002122,0.002657,0.001189,0.000772,0.000178,0.000188,"GreaterEuropean,British"
98,Diaz,0.084,18.742,99,0.149892,0.104207,0.027041,0.027311,0.014420,0.009421,...,0.087129,0.084607,0.044151,0.021508,0.021745,0.007744,0.004343,0.001290,0.001322,"GreaterEuropean,WestEuropean,Hispanic"


In [86]:
#Test a way to add surnames to the first names.
all_names["Full Name"] = all_names.Name + " " + surnames.iloc[:len(all_names)].Name

In [87]:
all_names.head()

Unnamed: 0,Name,Frequency,Cumulative,Rank,Full Name
0,MARY,2.629,2.629,1,MARY SMITH
1,PATRICIA,1.073,3.702,2,PATRICIA JOHNSON
2,LINDA,1.035,4.736,3,LINDA WILLIAMS
3,BARBARA,0.98,5.716,4,BARBARA JONES
4,ELIZABETH,0.937,6.653,5,ELIZABETH BROWN


In [111]:
def names_df(n=30,incl_both=True):
    f = female.sample(n,random_state=2052)
    m = male.sample(n, random_state=2052)
    all_names = pd.concat([f,m])
    l = surnames.sample(len(all_names),random_state=2052)
    l.reset_index(drop=True,inplace=True)
    all_names.reset_index(drop=True,inplace=True)
    all_names["last"] = l.iloc[:len(all_names)].Name
    all_names["sex"] = all_names.Name.apply(lambda x: "female" if x in f.Name.values else "male")
    all_names["full_name"] = all_names.Name + " " + l.iloc[:len(all_names)].Name
    return all_names    

In [112]:
x = names_df(35)

In [114]:
x.tail()

Unnamed: 0,Name,Frequency,Cumulative,Rank,last,sex,full_name
65,LUCIEN,0.007,88.242,844,GERIG,male,LUCIEN GERIG
66,DAREN,0.008,87.762,777,ROVELLA,male,DAREN ROVELLA
67,PASQUALE,0.008,87.498,744,TRAWICK,male,PASQUALE TRAWICK
68,GROVER,0.016,85.196,543,COKELY,male,GROVER COKELY
69,CALEB,0.023,83.241,440,CARPINELLO,male,CALEB CARPINELLO
