In [1]:
import keras
import tensorflow as tf
print(keras.__version__)
print(tf.__version__)

2.4.3
2.4.1


In [2]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2
EPOCHS = 15

# Wikilabels
df = pd.read_csv('../../data/wiki/wiki_name_race.csv')
df.dropna(subset=['name_first', 'name_last'], inplace=True)
sdf = df

# Additional features
sdf['name_first'] = sdf.name_first.str.title()
sdf['name_last'] = sdf.name_last.str.title()

sdf.groupby('race').agg({'name_first': 'count'})

Unnamed: 0_level_0,name_first
race,Unnamed: 1_level_1
"Asian,GreaterEastAsian,EastAsian",5497
"Asian,GreaterEastAsian,Japanese",7333
"Asian,IndianSubContinent",7861
"GreaterAfrican,Africans",3672
"GreaterAfrican,Muslim",6242
"GreaterEuropean,British",41445
"GreaterEuropean,EastEuropean",8329
"GreaterEuropean,Jewish",10239
"GreaterEuropean,WestEuropean,French",12293
"GreaterEuropean,WestEuropean,Germanic",3869


## Preprocessing the input data

In [3]:
# concat last name and first name
sdf['name_last_name_first'] = sdf['name_last'] + ' ' + sdf['name_first']

# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_

# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

# build X from index of n-gram sequence
X = np.array(sdf.name_last_name_first.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = []
for x in X:
    X_len.append(len(x))

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(sdf.race.astype('category').cat.codes)

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

num_words = 2210
Max feature len = 74, Avg. feature len = 12


## Train a LSTM model

ref: http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [4]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words # 20000
feature_len = 25 # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

107097 train sequences
26775 test sequences
Pad sequences (samples x time)
X_train shape: (107097, 25)
X_test shape: (26775, 25)
13 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (107097, 13)
y_test shape: (26775, 13)


In [5]:
print('Build model...')

model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

Build model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 32)            70720     
_________________________________________________________________
lstm (LSTM)                  (None, 128)               82432     
_________________________________________________________________
dense (Dense)                (None, 13)                1677      
Total params: 154,829
Trainable params: 154,829
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Epoch 1/15
3013/3013 - 265s - loss: 1.4050 - accuracy: 0.5749 - val_loss: 1.1441 - val_accuracy: 0.6616
Epoch 2/15
3013/3013 - 227s - loss: 1.1184 - accuracy: 0.6725 - val_loss: 1.0773 - val_accuracy: 0.6884
Epoch 3/15
3013/3013 - 237s - loss: 1.0531 - accuracy: 0.6938 - val_loss: 1.0307 - val_accuracy: 0.7005
Epoch 4/15
3013/3013 - 239s - loss: 1.0128 - accuracy: 0.7066 - val_loss: 1.0081 - val_accuracy: 0.7088
Epoch 5/15
3013/3013 - 242s - loss: 0.9815 - accuracy: 0.7150 - val_loss: 0.9957 - val_accuracy: 0.7144
Epoch 6/15
3013/3013 - 243s - loss: 0.9569 - accuracy: 0.7232 - val_loss: 0.9867 - val_accuracy: 0.7192
Epoch 7/15
3013/3013 - 244s - loss: 0.9341 - accuracy: 0.7292 - val_loss: 0.9816 - val_accuracy: 0.7204
Epoch 8/15
3013/3013 - 243s - loss: 0.9168 - accuracy: 0.7341 - val_loss: 0.9701 - val_accuracy: 0.7226
Epoch 9/15
3013/3013 - 243s - loss: 0.8989 - accuracy: 0.7397 - val_loss: 0.9661 - val_accuracy: 0.7259
Epoch 10/15
3013/3013 - 233s - loss: 0.8823 - accuracy:

## Confusion Matrix

In [7]:
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(sdf.race.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))



837/837 - 8s




837/837 - 7s
                                       precision    recall  f1-score   support

     Asian,GreaterEastAsian,EastAsian       0.87      0.78      0.82      1099
      Asian,GreaterEastAsian,Japanese       0.90      0.89      0.90      1467
             Asian,IndianSubContinent       0.78      0.76      0.77      1572
              GreaterAfrican,Africans       0.56      0.40      0.46       734
                GreaterAfrican,Muslim       0.67      0.69      0.68      1248
              GreaterEuropean,British       0.75      0.89      0.81      8289
         GreaterEuropean,EastEuropean       0.79      0.75      0.77      1666
               GreaterEuropean,Jewish       0.52      0.41      0.46      2048
  GreaterEuropean,WestEuropean,French       0.73      0.57      0.64      2459
GreaterEuropean,WestEuropean,Germanic       0.47      0.45      0.46       774
GreaterEuropean,WestEuropean,Hispanic       0.71      0.70      0.70      2082
 GreaterEuropean,WestEuropean,Italian 

## Save model

In [8]:
model.save('./wiki/lstm/wiki_name_lstm.h5')

In [9]:
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('./wiki/lstm/wiki_name_vocab.csv', index=False, encoding='utf-8')