# Name2gender dataset compilation

In [1]:
import csv
import string
import unicodedata
import codecs

In [2]:
# name-gen dict
name_gender = {}

# helper cleaning function (lowercase, unicode->ascii)
def clean(s):
    uncoded = ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in string.ascii_letters
    )
    return uncoded.lower()

In [3]:
def to_csv(data, filename):
    with open(filename, 'w', newline='') as out:
        writer = csv.writer(out, delimiter=',')
        for k,v in data:
            writer.writerow([k,v])
            print(k,v)

## Add NLTK names corpus
We create the basic dataset from NLTK's male & female name corpora

In [4]:
import nltk
from nltk.corpus import names

# download names corpus
nltk.download("names")

# add nltk names
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])

[nltk_data] Downloading package names to /home/elb/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.


In [5]:
for name,gender in labeled_names:
    name = clean(name)
    name_gender[name] = gender
#     print(name)
len(name_gender)

7553

## Add @mbejda datasets
https://gist.github.com/mbejda
* Black-Female-Names.csv     (~2,400 black (African American) female names)
* Black-Male-Names.csv       (~50,000 black (African American) male names)
* White-Female-Names.csv     (~4,500 white (Caucasian) female names)
* White-Male-Names.csv       (~40,000 white (Caucasian) male names)
* Indian-Female-Names.csv    (~14,000 Indian female names)
* Indian-Male-Names.csv      (~14,000 Indian male)
* Hispanic-Female-Names.csv  (~200 hispanic female names)
* Hispanic-Male-Names.csv    (~4,000 hispanic male names)

In [6]:
mbejda_datasets = ["Black-Female-Names.csv", "Black-Male-Names.csv", 
                   "White-Female-Names.csv", "White-Male-Names.csv",
                   "Indian-Female-Names.csv", "Indian-Male-Names.csv",
                   "Hispanic-Female-Names.csv", "Hispanic-Male-Names.csv"]

def read_mbejda(filename):
    with codecs.open(filename, 'r', 'utf8') as file:
        reader = csv.reader(file, delimiter=",")
        lines = list(reader)
        headers = lines.pop(0) # last name, first name, gender, race
        for line in lines:
            name = clean(" ".join([t for t in line[1].strip().split(" ") if len(t) > 1])) # removes middle intial
            gender = "female" if line[2] == "f" else "male"
            if not name in name_gender:
                name_gender[name] = gender
#                 print(name, gender)
    len(dataset)

In [7]:
for dataset in mbejda_datasets:
    print("loading " + dataset + "...")
    read_mbejda(dataset)

loading Black-Female-Names.csv...
loading Black-Male-Names.csv...
loading White-Female-Names.csv...
loading White-Male-Names.csv...
loading Indian-Female-Names.csv...
loading Indian-Male-Names.csv...
loading Hispanic-Female-Names.csv...
loading Hispanic-Male-Names.csv...


In [8]:
len(name_gender)

17621

In [None]:
# remove empty string entry
name_gender.pop('', None)

'male'

In [None]:
to_csv(name_gender.items(), "nltk_mbejda.csv")

aamir male
aaron male
abbey female
abbie female
abbot male
abbott male
abby female
abdel male
abdul male
abdulkarim male
abdullah male
abe male
abel male
abelard male
abner male
abraham male
abram male
ace male
adair male
adam male
adams male
addie female
adger male
aditya male
adlai male
adnan male
adolf male
adolfo male
adolph male
adolphe male
adolpho male
adolphus male
adrian female
adrick male
adrien female
agamemnon male
aguinaldo male
aguste male
agustin male
aharon male
ahmad male
ahmed male
ahmet male
ajai male
ajay female
al male
alaa male
alain male
alan male
alasdair male
alastair male
albatros male
albert male
alberto male
albrecht male
alden male
aldis male
aldo male
aldric male
aldrich male
aldus male
aldwin male
alec male
aleck male
alejandro male
aleks male
aleksandrs male
alessandro male
alex female
alexander male
alexei male
alexis female
alf male
alfie female
alfonse male
alfonso male
alfonzo male
alford male
alfred male
alfredo male
algernon male
ali female
alic ma

geri female
germaine female
gerold male
gerome male
gerrard male
gerri female
gerrit male
gerry female
gershom male
gershon male
giacomo male
gian male
giancarlo male
giavani male
gibb male
gideon male
giff male
giffard male
giffer male
giffie male
gifford male
giffy male
gil male
gilbert male
gilberto male
gilburt male
giles male
gill female
gilles male
ginger female
gino male
giordano male
giorgi male
giorgio male
giovanne male
giovanni male
giraldo male
giraud male
giuseppe male
glen female
glenn female
glynn male
godard male
godart male
goddard male
goddart male
godfree male
godfrey male
godfry male
godwin male
gomer male
gonzales male
gonzalo male
goober male
goose male
gordan male
gordie male
gordon male
grace female
grady male
graehme male
graeme male
graham male
graig male
grant male
granville male
greg male
gregg male
greggory male
gregor male
gregorio male
gregory male
gretchen female
griff male
griffin male
griffith male
griswold male
grove male
grover male
guido male
guilla

roberto male
robin female
robinson male
rochester male
rock male
rockwell male
rocky male
rod male
rodd male
roddie male
roddy male
roderic male
roderich male
roderick male
roderigo male
rodge male
rodger male
rodney male
rodolfo male
rodolph male
rodolphe male
rodrick male
rodrigo male
rodrique male
rog male
roger male
rogers male
roice male
roland male
rolando male
rolf male
rolfe male
rolland male
rollin male
rollins male
rollo male
rolph male
romain male
roman male
romeo male
ron male
ronald male
ronen male
roni male
ronnie female
ronny female
roosevelt male
rory female
roscoe male
ross male
roth male
rourke male
rowland male
roy male
royal male
royce male
rube male
ruben male
rubin male
ruby female
rudd male
ruddie male
ruddy male
rudie male
rudiger male
rudolf male
rudolfo male
rudolph male
rudy male
rudyard male
rufe male
rufus male
rupert male
ruperto male
russ male
russel male
russell male
rustie male
rustin male
rusty male
rutger male
rutherford male
rutledge male
rutter male

blinni female
blinnie female
blinny female
bliss female
blisse female
blithe female
blondell female
blondelle female
blondie female
blondy female
blythe female
bobbette female
bobbi female
bobette female
bobina female
bobine female
bobinette female
bonita female
bonnee female
bonni female
bonnie female
bonny female
brana female
brandais female
brande female
brandea female
brandi female
brandice female
brandie female
brandise female
brea female
breanne female
brear female
bree female
breena female
bren female
brena female
brenda female
brenn female
brenna female
bria female
briana female
brianna female
brianne female
bride female
bridget female
bridgett female
bridgette female
bridie female
brier female
brietta female
brigid female
brigida female
brigit female
brigitta female
brigitte female
brina female
briney female
briny female
brit female
brita female
britaney female
britani female
briteny female
britney female
britni female
britta female
brittan female
brittany female
britte female

## Add blk database

In [None]:
with codecs.open('blk_names.csv', 'r', 'utf8') as blkfile:
    reader = csv.reader(blkfile)
    for row in reader:
        name = clean(row[0])
        gender = "female" if clean(''.join(row[-1])) == 'f' else "male"
        if not name in name_gender:
            name_gender[name] = gender
#             print(name, gender)
len(name_gender)

In [None]:
# remove empty string entry
name_gender.pop('', None)

In [None]:
len(name_gender)

In [None]:
to_csv(name_gender.items(), "nltk_mbejda_blk.csv")

## Add corpus from mfran

In [None]:
with codecs.open('mfran_names.csv', 'r', 'utf8') as file:
    reader = csv.reader(file, delimiter=',')
    for row in reader:
        # in order to handle the few messed up records
        gender = clean(row[-1])
        name = clean(''.join(row[:-1]))
        if not name in name_gender:
            name_gender[name] = gender
#             print(name)
len(name_gender)

In [None]:
# remove empty string entry
name_gender.pop('', None)

## Final dataset  to CSV

In [None]:
import random
shuffled = list(name_gender.items())
random.shuffle(shuffled)

In [None]:
to_csv(shuffled,'name_gender_dataset.csv')