## Ethnicity Classification based on names
### Names Dataset for 4 classes from https://mbejda.github.io/

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
import string
from sklearn.utils import shuffle


In [None]:
df = pd.read_csv('test.csv')
print(df.head())
print(df.info())


In [3]:
#removing single letter initials from fname

df["fname"] = df["fname"].str.strip()
df = df[df.fname.str.count(' ') <3] #keeping only max 3 word strings
df['fname'] = df['fname'].str.split().map(lambda sl: " ".join(s for s in sl if len(s) > 2)) #keeping only strings which are >2 in lenght
df = df[df.fname.str.count(' ') == 0] #Only keeping single word strings for better accuracy


In [4]:
df.race.value_counts()

white       48115
black       36799
indian      16533
hispanic     4345
Name: race, dtype: int64

In [5]:
#Concatinating into full name for better data to be fed into the lstm
df['fullname'] = df.lname.astype(str).str.cat(df.fname.astype(str), sep=' ')

In [6]:
df.head()

Unnamed: 0,lname,fname,race,fullname
0,abraham,tashanika,black,abraham tashanika
1,adams,denetra,black,adams denetra
2,adams,tomesha,black,adams tomesha
3,adams,trellany,black,adams trellany
4,adderley,cynthia,black,adderley cynthia


In [7]:
#Some cleaning on the fullname just in case

df = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;.]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
  
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
df['fullname'] = df['fullname'].apply(clean_text)
df['fullname'] = df['fullname'].str.replace('\d+', '')
print(df.head())
print(df.fullname.str.len().max())

      lname      fname   race           fullname
0   abraham  tashanika  black  abraham tashanika
1     adams    denetra  black      adams denetra
2     adams    tomesha  black      adams tomesha
3     adams   trellany  black     adams trellany
4  adderley    cynthia  black   adderley cynthia
36


In [8]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2
SAMPLE = 1000000
EPOCHS = 15

# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
a = vect.fit_transform(df.fullname)
vocab = vect.vocabulary_

# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

# build X from index of n-gram sequence
X = np.array(df.fullname.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = []
for x in X:
    X_len.append(len(x))

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(df.race.astype('category').cat.codes)

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

num_words = 605
Max feature len = 35, Avg. feature len = 11


In [9]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words # 20000
feature_len = 25 # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

84633 train sequences
21159 test sequences
Pad sequences (samples x time)
X_train shape: (84633, 25)
X_test shape: (21159, 25)
4 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (84633, 4)
y_test shape: (21159, 4)


In [10]:
print('Build model...')

if False:
    model = Sequential()
    model.add(Embedding(num_words, 32, input_length=feature_len))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_classes, activation='softmax'))
if True:
    embedding_vecor_length = 32
    model = Sequential()
    model.add(Embedding(num_words, embedding_vecor_length, input_length=feature_len))
    model.add(Conv1D(activation="relu", padding="same", filters=32, kernel_size=3))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(100))
    model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

W1011 14:25:22.401192  6460 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1011 14:25:22.424132  6460 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1011 14:25:22.427124  6460 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1011 14:25:22.471040  6460 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.



Build model...


W1011 14:25:22.735329  6460 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1011 14:25:22.763223  6460 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 32)            19360     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 25, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 12, 32)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 404       
Total params: 76,068
Trainable params: 76,068
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=1)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 76169 samples, validate on 8464 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 0.49694026006665887
Test accuracy: 0.77101942436688


In [15]:
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.49694026006665887
Test accuracy: 0.77101942436688


In [16]:
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(df.race.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))

              precision    recall  f1-score   support

       black       0.76      0.68      0.72      7360
    hispanic       0.38      0.22      0.28       869
      indian       1.00      1.00      1.00      3307
       white       0.72      0.81      0.77      9623

    accuracy                           0.77     21159
   macro avg       0.72      0.68      0.69     21159
weighted avg       0.77      0.77      0.77     21159

[[5009   35    0 2316]
 [  31  191    0  647]
 [   0    0 3306    1]
 [1542  271    2 7808]]


In [17]:
model.save('ethnicolr_approach.h5')

In [18]:
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('name_vocab.csv', index=False, encoding='utf-8')

In [19]:
unique_elements, counts_elements = np.unique(y, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[    0     1     2     3]
 [36799  4345 16533 48115]]


In [20]:
list(df.race.astype('category').cat.categories)

['black', 'hispanic', 'indian', 'white']

In [21]:

def find_ngrams(vocab, text, n):
    """Find and return list of the index of n-grams in the vocabulary list.
    Generate the n-grams of the specific text, find them in the vocabulary list
    and return the list of index have been found.
    Args:
        vocab (:obj:`list`): Vocabulary list.
        text (str): Input text
        n (int): N-grams
    Returns:
        list: List of the index of n-grams in the vocabulary list.
    """

    wi = []

    if not isinstance(text, str):
        return wi

    a = zip(*[text[i:] for i in range(n)])
    for i in a:
        w = ''.join(i)
        try:
            idx = vocab.index(w)
        except Exception as e:
            idx = 0
        wi.append(idx)
    return wi

In [22]:
vdf = pd.read_csv("pred/name_vocab.csv")
vocab = vdf.vocab.tolist()

rdf = pd.read_csv("pred/race.csv")
race = rdf.race.tolist()

model = load_model("pred/ethnicolr_approach.h5")

names = ["narendra modi","donald trump","parth agrawal","raul feliciano","rinku"]

df = pd.DataFrame(names,columns =['names']) 

X = np.array(df.names.apply(lambda c:find_ngrams(vocab,c, 2)))
X = sequence.pad_sequences(X, maxlen=25)
model.predict_classes(X, verbose=2)


array([3, 0, 0, 3, 3], dtype=int64)

In [23]:
nn = df['names'].notnull()
df.loc[nn, 'pred'] = model.predict_classes(X, verbose=2)

df.loc[nn, 'predicted_race'] = df[nn]['pred'].apply(lambda c:
                                            race[int(c)])
del df['pred']
df['predicted_race']


0    white
1    black
2    black
3    white
4    white
Name: predicted_race, dtype: object

In [24]:
proba = model.predict_proba(X, verbose=2)

pdf = pd.DataFrame(proba, columns=race)
pdf.set_index(df[nn].index, inplace=True)

rdf = pd.concat([df, pdf], axis=1)

In [25]:
rdf

Unnamed: 0,names,predicted_race,black,hispanic,indian,white
0,narendra modi,white,0.056661,0.160956,1.6e-05,0.782368
1,donald trump,black,0.8556,0.001394,2.8e-05,0.142978
2,parth agrawal,black,0.772356,0.002073,1.8e-05,0.225553
3,raul feliciano,white,0.033335,0.364241,1.6e-05,0.602407
4,rinku,white,0.19824,0.111147,0.061857,0.628756
