<a href="https://colab.research.google.com/github/ashavish/name-nationality/blob/master/name_nationality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiments with models to classify names as Indian / non-Indian

In [2]:
from tensorflow import keras
import tensorflow as tf
import pandas as pd
import os
import re

In [None]:
# Print the installed TensorFlow version.
print(tf.__version__)

2.3.0


# Data Loading and Cleaning
Ref Data - https://www.kaggle.com/chaitanyapatil7/indian-names
<br>
Import Male and Female Indian Names


In [None]:
# Locations of the raw Indian-name CSVs (one file per gender) in the project's GitHub repo.
m_url = "https://raw.githubusercontent.com/ashavish/name-nationality/master/data/datasets_70812_149772_Indian-Male-Names.csv"
f_url = "https://raw.githubusercontent.com/ashavish/name-nationality/master/data/datasets_70812_149772_Indian-Female-Names.csv"

# Read each CSV into its own DataFrame.
female_data = pd.read_csv(f_url)
male_data = pd.read_csv(m_url)

In [None]:
# Substrings replaced by a single space during cleaning. The multi-character
# markers ('s/o', 'd/o', 'w/o') are listed before the bare '/' so that they
# are matched as a whole first.
repl_list = ['s/o','d/o','w/o','/','&',',','-']

def clean_data(name):
    """Normalise a raw name string.

    Steps: lower-case, drop non-ASCII characters, replace every entry of
    ``repl_list`` with a space, cut everything from the first '@' onwards,
    and collapse all runs of whitespace to a single space (also trimming
    leading/trailing whitespace).

    Parameters
    ----------
    name : object
        Raw name value; converted with ``str()``. ``None`` and float NaN
        (missing values) are mapped to the empty string instead of the
        literal text "none"/"nan".

    Returns
    -------
    str
        The cleaned name, possibly empty.
    """
    # Missing values must not turn into the pseudo-name "nan"; an empty
    # string yields a word count of 0 downstream.
    if name is None or (isinstance(name, float) and name != name):
        return ""
    name = str(name).lower()
    name = ''.join(ch for ch in name if ord(ch) < 128)
    for repl in repl_list:
        name = name.replace(repl, " ")
    if '@' in name:
        name = name[:name.find('@')]
    # split() without arguments collapses repeated whitespace and drops
    # leading/trailing whitespace left behind by the replacements above.
    return " ".join(name.split())

def remove_records(merged_data):
    """Flag unusable rows and return only the rows that are kept.

    A row is flagged when its name contains the substring 'with', has five
    or more words, has zero words, or contains a digit. The flag is written
    to a 'delete' column (0 = keep, 1 = drop) that is added to
    ``merged_data`` itself, so the caller's frame is modified.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Must provide the columns 'name' and 'count_words'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``merged_data`` whose 'delete' flag is 0.
    """
    names = merged_data['name']
    word_counts = merged_data['count_words']

    merged_data['delete'] = 0
    drop_mask = (
        (names.str.find('with') != -1)
        | (word_counts >= 5)
        | (word_counts == 0)
        | (names.str.contains(r'\d') == True)
    )
    merged_data.loc[drop_mask, 'delete'] = 1
    return merged_data[merged_data['delete'] == 0]

# Stack the male and female name frames into a single frame.
merged_data = pd.concat([male_data, female_data], axis=0)

# Normalise every name, then record how many whitespace-separated words it has.
merged_data['name'] = merged_data['name'].map(clean_data)
merged_data['count_words'] = merged_data['name'].str.split().map(len)

# Drop the rows flagged as unusable by remove_records.
cleaned_data = remove_records(merged_data)

# Keep one row per distinct name and tag the whole set as 'indian'.
indian_cleaned_data = (
    cleaned_data[['name', 'count_words']]
    .drop_duplicates(subset='name', keep='first')
    .assign(label='indian')
)

len(indian_cleaned_data)

13754

In [None]:
# Preview the first rows of the cleaned, de-duplicated Indian names.
indian_cleaned_data.head()

Unnamed: 0,name,count_words,label
0,barjraj,1,indian
1,ramdin verma,2,indian
2,sharat chandran,2,indian
3,birender mandal,2,indian
4,amit,1,indian


Let's create some non-Indian names using Faker, a package that generates realistic names for different locales.

In [None]:
#!pip3 install faker
from faker import Faker
import random

# Number of names to generate (before de-duplication).
req = 15000
non_indian_names = []

# Locales to sample names from. 'en_IN' (English, India) is deliberately NOT
# in this list: names from that locale would be labelled 'non_indian' further
# down and add label noise to the negative class.
langs = ['ar_EG','bs_BA','de_DE','dk_DK','en_AU','en_CA','en_GB',
'en_NZ','en_US','it_IT','no_NO','ro_RO']

# Build one generator per locale up front; constructing a Faker object is
# expensive and does not need to be repeated for every generated name.
fakers = {lang: Faker(lang) for lang in langs}

for _ in range(req):
    fake = fakers[random.choice(langs)]
    non_indian_names.append(fake.name().lower())

# De-duplicate; note that set() does not preserve generation order.
non_indian_names_orig = list(set(non_indian_names))


In [None]:
# Number of distinct generated names left after de-duplication.
len(non_indian_names_orig)

14561

In [None]:
# One row per generated name, plus the number of whitespace-separated words in it.
non_indian_data = pd.DataFrame({'name': non_indian_names_orig}).assign(
    count_words=lambda frame: frame['name'].str.split().map(len)
)
non_indian_data.head()

Unnamed: 0,name,count_words
0,sara gulbrandsen,2
1,kathryn villarreal,2
2,jennifer mccormick,2
3,james eaton,2
4,melissa bond,2


Let's check the distribution of the number of words per name in both sets. We don't want the two distributions to be too different.

In [None]:
# How many Indian names have 1, 2, 3, ... words.
indian_cleaned_data['count_words'].value_counts()


2    7954
1    4322
3    1344
4     134
Name: count_words, dtype: int64

In [None]:
# How many generated non-Indian names have 1, 2, 3, ... words.
non_indian_data['count_words'].value_counts()

2    12785
3     1586
4      189
5        1
Name: count_words, dtype: int64

The generated names contain no one-word names at all, so let's create some by keeping only the first word of a subset of the two-word names.

In [None]:
# Number of two-word names that are reduced to their first word.
one_word_req = 5000

two_word_names = non_indian_data.loc[non_indian_data['count_words'] == 2, 'name']
to_shorten = two_word_names.iloc[:one_word_req]
kept_as_is = two_word_names.iloc[one_word_req:]

# First word of each shortened name, followed by the untouched two-word names.
names_one_two_words = [full_name.split()[0] for full_name in to_shorten] + list(kept_as_is)
count_words = [1] * one_word_req + [2] * len(kept_as_is)

# Names that never had exactly two words are carried over unchanged.
not_two_words_pd = non_indian_data[non_indian_data['count_words'] != 2]
one_two_words_pd = pd.DataFrame({'name': names_one_two_words, 'count_words': count_words})

non_indian_data = pd.concat((not_two_words_pd, one_two_words_pd), axis=0)
non_indian_data['label'] = 'non_indian'

# Keep only names with fewer than five words.
non_indian_data = non_indian_data[non_indian_data['count_words'] < 5]
non_indian_data['count_words'].value_counts()

2    7785
1    5000
3    1586
4     189
Name: count_words, dtype: int64

In [None]:
# Combine both classes, keeping only the name and its label.
full_data = pd.concat((non_indian_data[['name','label']],indian_cleaned_data[['name','label']]),axis=0)
# Shuffle the rows; the fixed seed makes the row order reproducible across runs.
full_data = full_data.sample(frac=1, random_state=42)

full_data.to_csv("name_data.csv",index=False)

# Offer the file as a browser download when running on Google Colab; in any
# other environment the CSV simply stays on disk.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('name_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Get processed data files and split into train and test

In [3]:
# Processed, labelled name data published in the project's GitHub repo.
data_url = "https://raw.githubusercontent.com/ashavish/name-nationality/master/data/name_data.csv"
name_data = pd.read_csv(filepath_or_buffer=data_url)

In [None]:
# Preview the first rows of the labelled name data.
name_data.head()

Unnamed: 0,name,label
0,tufail ahamd,indian
1,md savir,indian
2,tahira bibi,indian
3,shambu dayal,indian
4,prof. harro niemeier,non_indian


In [None]:
# Number of rows per label.
name_data['label'].value_counts()

non_indian    14560
indian        13754
Name: label, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

# Features are the names (forced to str); targets are the labels.
X = name_data['name'].astype(str)
Y = name_data['label']

# 80/20 split, stratified on the label and seeded for repeatability.
train_names, test_names, train_labels, test_labels = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)


# Using Naive Bayes with Count Vectorizer for name classification

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import confusion_matrix,classification_report

# Bag-of-words representation of the training names.
vectorizer = CountVectorizer()
X_ = vectorizer.fit_transform(train_names.values.astype('U'))

# Number of distinct tokens learned. `vocabulary_` is used instead of
# `get_feature_names()`, which was removed in scikit-learn 1.2.
len(vectorizer.vocabulary_)


11435

In [None]:
# Train Naive Bayes on the word-count features of the training names.
model = MultinomialNB().fit(X_, train_labels)

# Vectorise the held-out names with the already-fitted vocabulary and score them.
X_test = vectorizer.transform(test_names.values.astype('U'))
test_predicted = model.predict(X_test)

print(classification_report(test_labels, test_predicted))

              precision    recall  f1-score   support

      indian       0.99      0.77      0.87      2751
  non_indian       0.82      0.99      0.90      2912

    accuracy                           0.89      5663
   macro avg       0.91      0.88      0.88      5663
weighted avg       0.90      0.89      0.88      5663



**Testing on new Names**
<br>
Let's curate some names that are not present in the data at all, and check the model on these names.

In [None]:
# Hand-picked names used to probe each model on unseen input.
check_new_names = [
    'lalitha', 'tyson', 'shailaja', 'shyamala', 'vishwanathan', 'ramanujam', 'conan', 'kryslovsky',
    'ratnani', 'diego', 'kakoli', 'shreyas', 'brayden', 'shanon',
]

# Vectorise with the fitted CountVectorizer and predict with the Naive Bayes model.
X_new = vectorizer.transform(check_new_names)
predictions_nb_cv = model.predict(X_new)

test = pd.DataFrame({
    'names': check_new_names,
    'predictions_nb_cv': predictions_nb_cv,
})
test

Unnamed: 0,names,predictions_nb_cv
0,lalitha,non_indian
1,tyson,non_indian
2,shailaja,non_indian
3,shyamala,non_indian
4,vishwanathan,non_indian
5,ramanujam,non_indian
6,conan,non_indian
7,kryslovsky,non_indian
8,ratnani,non_indian
9,diego,non_indian


It doesn't do well at all! But that's expected: a whole-word vocabulary has no features for names it has never seen. Now let's try a subword encoding.



# Naive Bayes with SentencePiece Encoding

In [None]:
#!pip3 install tokenizers
from tokenizers import ByteLevelBPETokenizer,CharBPETokenizer,SentencePieceBPETokenizer,BertWordPieceTokenizer

# Write the training names, one per line, as the tokenizer's training corpus.
# The context manager guarantees the file is closed even if a write fails, and
# the explicit UTF-8 encoding keeps non-ASCII names intact regardless of the
# platform's default encoding.
with open("train_names.txt", "w", encoding="utf-8") as f:
    for each in train_names:
        f.write(str(each) + "\n")

# Learn a subword vocabulary from the training names only.
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(["./train_names.txt"],vocab_size=2000,min_frequency=2)

# Encode every name as its subword pieces joined by single spaces, so the
# result can be fed to a text vectorizer.
encoded_tokens = [" ".join(tokenizer.encode(str(each)).tokens) for each in train_names]
encoded_tokens_test = [" ".join(tokenizer.encode(str(each)).tokens) for each in test_names]

encoded_tokens[:10]

['▁ali ▁moham ad',
 '▁ann',
 '▁smt. ▁manju ▁jat',
 '▁ar sh',
 '▁nicholas ▁st ri ck land',
 '▁san osh i',
 '▁brian ▁web ster',
 '▁kul vinder ▁kaur',
 '▁christine ▁l ong',
 '▁daniel']

In [None]:
# TF-IDF features over the space-joined subword pieces.
# NOTE(review): TfidfVectorizer is used with its default tokenisation here;
# per the scikit-learn docs the default token_pattern keeps only tokens of two
# or more word characters, so one-character pieces would be ignored - confirm
# that this is intended.
tfidf_vect = TfidfVectorizer()
X_ = tfidf_vect.fit_transform(encoded_tokens)
# The original cell evaluated len(tfidf_vect.get_feature_names()) here and
# discarded the result; that method was removed in scikit-learn 1.2, so the
# portable `vocabulary_` attribute is used to keep the cell runnable.
len(tfidf_vect.vocabulary_)

model = MultinomialNB()
model.fit(X_,train_labels)

# Score the held-out names with the fitted vectorizer and model.
X_test = tfidf_vect.transform(encoded_tokens_test)

test_predicted = model.predict(X_test)

print(classification_report(test_labels,test_predicted))

              precision    recall  f1-score   support

      indian       0.97      0.97      0.97      2751
  non_indian       0.97      0.97      0.97      2912

    accuracy                           0.97      5663
   macro avg       0.97      0.97      0.97      5663
weighted avg       0.97      0.97      0.97      5663



Pretty decent. Now let's check it on the new names.

In [None]:
# Subword-encode the probe names (lower-cased) and join the pieces with spaces.
encoded_tokens_check = [
    " ".join(tokenizer.encode(str(each).lower()).tokens)
    for each in check_new_names
]

# Vectorise with the fitted TF-IDF vectorizer and predict.
X_new = tfidf_vect.transform(encoded_tokens_check)
predictions_nb_enc_tf = model.predict(X_new)

test = pd.DataFrame({
    'names': check_new_names,
    'predictions_nb_enc_tf': predictions_nb_enc_tf,
})
test

Unnamed: 0,names,predictions_nb_enc_tf
0,lalitha,indian
1,tyson,non_indian
2,shailaja,indian
3,shyamala,indian
4,vishwanathan,indian
5,ramanujam,indian
6,conan,non_indian
7,kryslovsky,non_indian
8,ratnani,indian
9,diego,non_indian


# Let's also try a character-based encoding with an LSTM model

In [5]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import Callback
np.random.seed(42)


In [6]:

def char_encoded_representation(data,tokenizer,vocab_size,max_len):
    """One-hot encode texts with a fitted Keras tokenizer and pad them.

    Parameters
    ----------
    data : iterable of str
        Texts to encode.
    tokenizer : keras Tokenizer
        Already-fitted tokenizer; its ``texts_to_sequences`` supplies the
        integer ids for each text.
    vocab_size : int
        Width of each one-hot vector (``num_classes`` for ``to_categorical``).
    max_len : int
        Sequence length passed to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    numpy.ndarray
        Array produced by ``pad_sequences`` from the per-text one-hot matrices.
    """
    one_hot_per_text = []
    for id_sequence in tokenizer.texts_to_sequences(data):
        one_hot_per_text.append(to_categorical(id_sequence, num_classes=vocab_size))
    return sequence.pad_sequences(one_hot_per_text, maxlen=max_len)


In [9]:
# Length, in characters, of the longest training name; used as the padded sequence length.
max_len = max(len(str(name)) for name in train_names)

# Character-level tokenizer fitted on the training names only.
tok = Tokenizer(char_level=True)
tok.fit_on_texts(train_names)

# One extra slot on top of the learned characters (word_index ids start at 1).
vocab_size = len(tok.word_index) + 1

X_train = char_encoded_representation(train_names, tok, vocab_size, max_len)
X_train.shape

(22651, 36, 53)

In [10]:
# Encode the held-out names with the tokenizer that was fitted on the training names.
X_test = char_encoded_representation(test_names,tok,vocab_size,max_len)
X_test.shape


(5663, 36, 53)

In [11]:
# Map the string labels to integers; the encoder is fitted on the training labels only.
le = LabelEncoder()
y_train = le.fit_transform(train_labels)
y_test = le.transform(test_labels)


In [None]:
# Model Specification


def build_model(hidden_units,max_len,vocab_size):
    """Build and compile a single-layer LSTM binary classifier.

    Parameters
    ----------
    hidden_units : int
        Number of units in the LSTM layer.
    max_len : int
        Number of time steps in each input sequence.
    vocab_size : int
        Size of the feature vector at each time step.

    Returns
    -------
    Sequential
        Model with one LSTM layer followed by a one-unit sigmoid Dense layer,
        compiled with binary cross-entropy loss, the Adam optimizer and
        accuracy as metric. The model summary is printed as a side effect.
    """
    model = Sequential()
    model.add(LSTM(hidden_units, input_shape=(max_len, vocab_size)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints on its own and returns None; wrapping it in print()
    # only appended a stray "None" line to the output.
    model.summary()
    return model

class myCallback(Callback):
    """Keras callback that reports loss and accuracy on a fixed dataset after every epoch.

    Parameters
    ----------
    X_test : array-like
        Inputs passed to ``model.evaluate`` at the end of each epoch.
    y_test : array-like
        Targets passed to ``model.evaluate`` at the end of each epoch.
    """

    def __init__(self,X_test,y_test):
        # Let the base class set up its own attributes first.
        super().__init__()
        self.X_test = X_test
        self.y_test = y_test

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the model being trained and print the resulting loss/accuracy."""
        # Use the model Keras attached to this callback rather than whatever
        # the global name `model` happens to refer to when the epoch ends.
        loss, acc = self.model.evaluate(self.X_test, self.y_test, verbose=0)
        print('\nTesting loss: {}, acc: {}\n'.format(loss, acc))


In [None]:
# Character-level LSTM with 100 hidden units.
model = build_model(100,max_len,vocab_size)
# `callbacks` is documented as a list of Callback instances, so the callback
# is wrapped in a list instead of being passed bare.
model.fit(X_train, y_train, epochs=20, batch_size=64, callbacks=[myCallback(X_test,y_test)])


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               61600     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 61,701
Trainable params: 61,701
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/20
Testing loss: 0.3409741520881653, acc: 0.8458414077758789

Epoch 2/20
Testing loss: 0.2621632516384125, acc: 0.8848666548728943

Epoch 3/20
Testing loss: 0.2402869462966919, acc: 0.9011124968528748

Epoch 4/20
Testing loss: 0.1855960190296173, acc: 0.926540732383728

Epoch 5/20
Testing loss: 0.17371733486652374, acc: 0.9334275126457214

Epoch 6/20
Testing loss: 0.15379096567630768, acc: 0.9404909014701843

Epoch 7/20
Testing loss: 0.14643630385398865, acc: 0.9431396722793579

Epoch 8/20
Test

<tensorflow.python.keras.callbacks.History at 0x7fc4cadfc518>

In [None]:
# Encode the probe names exactly like the training data. The previous call
# passed an undefined `mapping` and omitted vocab_size/max_len, which does not
# match char_encoded_representation(data, tokenizer, vocab_size, max_len).
X_predict = char_encoded_representation(check_new_names, tok, vocab_size, max_len)

predictions_prob = model.predict(X_predict)
# Threshold the sigmoid outputs at 0.5. ravel() always yields a 1-D array,
# even when only a single name is scored (squeeze() would give a 0-d array).
predictions = (np.asarray(predictions_prob) > 0.5).astype(int).ravel()
predictions_lstm_char = le.inverse_transform(predictions)
test = pd.DataFrame({'names':check_new_names,'predictions_lstm_char':predictions_lstm_char})
test

Unnamed: 0,names,predictions_lstm_char
0,lalitha,indian
1,tyson,non_indian
2,shailaja,indian
3,shyamala,indian
4,vishwanathan,indian
5,ramanujam,indian
6,conan,non_indian
7,kryslovsky,non_indian
8,ratnani,indian
9,diego,non_indian


# SentencePiece Encoding with LSTM
Let's also try the SentencePiece encoding we used for Naive Bayes, this time as input to an LSTM and with a much smaller vocabulary.

In [None]:
from tokenizers import ByteLevelBPETokenizer,CharBPETokenizer,SentencePieceBPETokenizer,BertWordPieceTokenizer
# Much smaller subword vocabulary than the one used for Naive Bayes.
# Note: this rebinds `vocab_size` and `tokenizer` from the earlier sections.
vocab_size = 200

# Train a fresh tokenizer on the training-name corpus written earlier.
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(
    ["./train_names.txt"],
    vocab_size=vocab_size,
    min_frequency=2,
)


def sent_piece_encoded_representation(data,tokenizer):
    """One-hot encode texts by their subword ids and pad them.

    Reads the module-level ``vocab_size`` (one-hot width) and ``max_len``
    (``maxlen`` for ``pad_sequences``) at call time.

    Parameters
    ----------
    data : iterable
        Texts to encode; each item is converted with ``str()``.
    tokenizer : tokenizers tokenizer
        Trained tokenizer; ``encode(...).ids`` supplies the ids for each text.

    Returns
    -------
    numpy.ndarray
        Array produced by ``pad_sequences`` from the per-text one-hot matrices.
    """
    one_hot_per_text = []
    for text in data:
        piece_ids = tokenizer.encode(str(text)).ids
        one_hot_per_text.append(to_categorical(piece_ids, num_classes=vocab_size))
    return sequence.pad_sequences(one_hot_per_text, maxlen=max_len)

In [None]:
# Padded sequence length: the longest training name measured in characters.
max_len = max(len(str(name)) for name in train_names)

# Integer-encode the labels; the encoder is fitted on the training labels only.
le = LabelEncoder()
y_train = le.fit_transform(train_labels)
y_test = le.transform(test_labels)

# Subword one-hot encoding of the training names.
X_train = sent_piece_encoded_representation(train_names, tokenizer)
X_train.shape


(22651, 36, 200)

In [None]:
# Encode the held-out names with the tokenizer trained on the training-name corpus.
X_test = sent_piece_encoded_representation(test_names,tokenizer)
X_test.shape

(5663, 36, 200)

In [None]:
# LSTM with 100 hidden units over the subword one-hot sequences.
model = build_model(100,max_len,vocab_size)
# `callbacks` is documented as a list of Callback instances, so the callback
# is wrapped in a list instead of being passed bare.
model.fit(X_train, y_train, epochs=12, batch_size=64, callbacks=[myCallback(X_test,y_test)])


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 100)               120400    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 101       
Total params: 120,501
Trainable params: 120,501
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/12
Testing loss: 0.2177201509475708, acc: 0.9141797423362732

Epoch 2/12
Testing loss: 0.18794947862625122, acc: 0.9214197397232056

Epoch 3/12
Testing loss: 0.1680028885602951, acc: 0.9337806701660156

Epoch 4/12
Testing loss: 0.15903355181217194, acc: 0.9355465173721313

Epoch 5/12
Testing loss: 0.151988223195076, acc: 0.9396079778671265

Epoch 6/12
Testing loss: 0.14407828450202942, acc: 0.9441992044448853

Epoch 7/12
Testing loss: 0.13729199767112732, acc: 0.9496732950210571

Epoch 8/12
T

<tensorflow.python.keras.callbacks.History at 0x7fc4cd2f8d30>

In [None]:
# Subword one-hot encoding of the probe names.
X_predict = sent_piece_encoded_representation(check_new_names,tokenizer)

predictions_prob = model.predict(X_predict)
# Threshold the sigmoid outputs at 0.5 element-wise. The earlier
# np.where(...)[0] form indexed whole rows, which is only correct for a
# single output column; ravel() also keeps the result 1-D when just one name
# is scored (squeeze() would give a 0-d array).
predictions = (np.asarray(predictions_prob) > 0.5).astype(int).ravel()

predictions_lstm_sent_enc = le.inverse_transform(predictions)
test = pd.DataFrame({'names':check_new_names,'predictions_lstm_sent_enc':predictions_lstm_sent_enc})
test

Unnamed: 0,names,predictions_lstm_sent_enc
0,lalitha,indian
1,tyson,non_indian
2,shailaja,non_indian
3,shyamala,indian
4,vishwanathan,indian
5,ramanujam,indian
6,conan,non_indian
7,kryslovsky,non_indian
8,ratnani,indian
9,diego,non_indian
