### Importing Required Packages

In [1]:
import numpy as np
import pandas as pd
from nltk.classify import NaiveBayesClassifier
from nltk.classify import accuracy
from sklearn.utils import shuffle
import pickle
import re

### Reading Data

In [2]:
black_female = pd.read_csv("data/Black-Female-Names.csv")
black_male = pd.read_csv("data/Black-Male-Names.csv")
hispanic_female = pd.read_csv("data/Hispanic-Female-Names.csv")
hispanic_male = pd.read_csv("data/Hispanic-Male-Names.csv")
indian_female = pd.read_csv("data/Indian-Female-Names.csv")
indian_male = pd.read_csv("data/Indian-Male-Names.csv")
white_female = pd.read_csv("data/White-Female-Names.csv")
white_male = pd.read_csv("data/White-Male-Names.csv")

### Removing Nulls and removing white space from column *first name*

In [3]:
black_female = black_female.dropna()
black_male = black_male.dropna()
hispanic_female = hispanic_female.dropna()
hispanic_male = hispanic_male.dropna()
indian_female = indian_female.dropna()
indian_male = indian_male.dropna()
white_female = white_female.dropna()
white_male = white_male.dropna()

black_female.columns = ['last name', 'first name', 'gender', 'race']
black_male.columns = ['last name', 'first name', 'gender', 'race']
hispanic_female.columns = ['last name', 'first name', 'gender', 'race']
hispanic_male.columns = ['last name', 'first name', 'gender', 'race']
white_female.columns = ['last name', 'first name', 'gender', 'race']
white_male.columns = ['last name', 'first name', 'gender', 'race']

### Having a look at the data

In [4]:
black_female.head()

Unnamed: 0,last name,first name,gender,race
0,abraham,tashanika l,f,black
1,adams,denetra c,f,black
2,adams,tomesha d,f,black
3,adams,trellany c,f,black
4,adderley,cynthia s,f,black


In [5]:
black_male.head()

Unnamed: 0,last name,first name,gender,race
0,aaron,aaric a,m,black
1,aaron,dominique j,m,black
2,aaron,fredrick r,m,black
3,aaron,jarvis,m,black
4,aaron,lorenzo,m,black


In [6]:
hispanic_female.head()

Unnamed: 0,last name,first name,gender,race
0,adams,shirley m,f,hispanic
1,alonso,ana d,f,hispanic
2,alonzo,miriam,f,hispanic
3,alvarez,ivette,f,hispanic
4,amador,saray,f,hispanic


In [7]:
hispanic_male.head()

Unnamed: 0,last name,first name,gender,race
0,abad,jonathan r,m,hispanic
1,abellas-bauzo,jonathan a,m,hispanic
2,abreu,pablo,m,hispanic
3,abreu sosa,basilio l,m,hispanic
4,abreu-bolano,german,m,hispanic


In [8]:
indian_female.head()

Unnamed: 0,name,gender,race
0,shivani,f,indian
1,isha,f,indian
2,smt shyani devi,f,indian
3,divya,f,indian
4,mansi,f,indian


In [9]:
indian_male.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [10]:
white_female.head()

Unnamed: 0,last name,first name,gender,race
0,abbott,leslie b,f,white
1,abbott,peggy s,f,white
2,abernathy,rebecca r,f,white
3,abidin,antoinette,f,white
4,able,shaylene n,f,white


In [11]:
white_male.head()

Unnamed: 0,last name,first name,gender,race
0,aaron,billy r,m,white
1,aaron,charles d,m,white
2,aaron,clarence d,m,white
3,aaron,justin c,m,white
4,aaron,roy s,m,white


### Data Preprocessing

Cleaning up all characters which are not in english language and also removing everything which come after some suffix.<br>
example: name **shyani devi w/o shyam lal** is cleaned to **shyani devi**<br><br>
These kind of scenarios were encountered only in Indian names

In [12]:
def clean_data(data):
    data = re.sub('[^a-zA-Z&/ ]+', ' ', data)
    data = re.split("(d/0|d/o|d/|w/0|w/o|w/|s/o|s/0|s/|\(|with)", data)[0]
    data = data.strip()
    if len(data) > 0:
        return (data)
    else:
        return np.NAN

print(indian_female.shape)
indian_female['clean_name'] = indian_female['name'].apply(clean_data)
indian_female = indian_female.dropna()
print(indian_female.shape)

print(indian_male.shape)
indian_male['clean_name'] = indian_male['name'].apply(clean_data)
indian_male = indian_male.dropna()
print(indian_male.shape)

(15351, 3)
(15280, 4)
(14821, 3)
(14776, 4)


#### Removed salutations like Mr, Mrs, smt, km, Shri, etc from names

In [13]:
def remove_salutation(data):
    data = re.split("(smt\W|km\W|mr\W|shri\W|mrs\W|miss\W)", data, 1)
    if(len(data) > 1):
        return (data[2].strip())
    else:
        return(data[0].strip())
indian_female['clean_name'] = indian_female['clean_name'].apply(remove_salutation)
indian_male['clean_name'] = indian_male['clean_name'].apply(remove_salutation)

### Generating First Name and Last Name from cleaned names

In [14]:
def get_fname(data):
    data = data.split(' ')
    return data[0]

def get_lname(data):
    data = data.split(' ')
    if(len(data) > 1):
        return data[-1]

indian_female['first name'] = indian_female['clean_name'].apply(get_fname)
indian_female['last name'] = indian_female['clean_name'].apply(get_lname)
indian_male['first name'] = indian_male['clean_name'].apply(get_fname)
indian_male['last name'] = indian_male['clean_name'].apply(get_lname)

In [15]:
indian_female = indian_female[['last name', 'first name', 'gender', 'race']]
indian_male = indian_male[['last name', 'first name', 'gender', 'race']]
# display(indian_female.head())
# display(indian_male.head())

### Indian Preprocessing complete now merge dataset

In [16]:
full_dataset = black_female.append(black_male, ignore_index = True)
full_dataset = full_dataset.append(hispanic_female, ignore_index=True)
full_dataset = full_dataset.append(hispanic_male, ignore_index=True)
full_dataset = full_dataset.append(indian_female, ignore_index=True)
full_dataset = full_dataset.append(indian_male, ignore_index=True)
full_dataset = full_dataset.append(white_female, ignore_index=True)
full_dataset = full_dataset.append(white_male, ignore_index=True)

In [17]:
full_dataset.shape
full_dataset.describe()

Unnamed: 0,last name,first name,gender,race
count,103543,120603,120603,120603
unique,25052,36987,2,5
top,kumar,michael,m,white
freq,1495,485,98068,48645


In [18]:
full_dataset = shuffle(full_dataset)

# Extract features

In [19]:
def gender_features(name):
        """
        Traing and Testing feature sets
        """
        name = name.lower()
        name = name.strip()
        return ({
                'last_is_vowel': (name[-1] in 'AEIOUY'),
                'last_letter': name[-1],
                'last_three': name[-3:],
                'last_two': name[-2:],
                'first_letter': name[0]
            })

In [20]:
features_dataset = full_dataset['first name'].apply(gender_features)

### Generating dataframe for features vs label mapping

In [21]:
dataset = pd.DataFrame()
dataset['feat'] = features_dataset
dataset['gender'] = full_dataset['gender']

### Test Train Split

In [22]:
tr_data_to_use = list(dataset.itertuples(index=False))

partition = int(len(features_dataset) * 0.9) # data set partition
tr_nb = tr_data_to_use[:partition]
test_nb = tr_data_to_use[partition:]

## Training Naive Bayes

In [23]:
classifier = NaiveBayesClassifier.train(tr_nb)

### Checking accuracy

In [24]:
print("Classifier accuracy: {:0.2%}".format(accuracy(classifier, test_nb)))

Classifier accuracy: 91.40%


## Saving model to file

In [25]:
naiveBayes = open("naive_bayes_gender_classification_model.pkl","wb")
pickle.dump(classifier,naiveBayes)
naiveBayes.close()

# Test it

In [26]:
name = input('Enter your name here: ')
dist = classifier.prob_classify(gender_features(name))
m, f = dist.prob("m"), dist.prob("f")
d = {m: "male", f: "female"}
prob = max(m,f)
guess = d[prob]
print("%s is predicted as %s (%.2f%%)" % (name, guess, prob * 100))

Enter your name here: Vishal
Vishal is predicted as female (52.65%)
