In [1]:
import keras
import tensorflow as tf
# Record the Keras and TensorFlow versions this notebook was executed with.
for lib in (keras, tf):
    print(lib.__version__)

2021-12-21 08:19:30.746483: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-21 08:19:30.746531: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


2.5.0
2.5.2


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2          # size of the character n-grams
SAMPLE = 1_000_000  # total number of rows sampled for modelling
EPOCHS = 20         # number of training epochs

# Raw race label -> modelling category; 'native_indian' and 'multi_racial'
# are folded into 'other'.
RACE_MAP = {
    'native_indian': 'other',
    'asian': 'asian',
    'nh_black': 'nh_black',
    'hispanic': 'hispanic',
    'nh_white': 'nh_white',
    'other': 'other',
    'multi_racial': 'other',
    'unknown': 'unknown',
}

# Florida voter registration: drop rows that lack either name part.
df = (
    pd.read_csv('/opt/data/fl_voterreg/fl_reg_name_race.csv.gz')
      .dropna(subset=['name_first', 'name_last'])
)
df['race'] = df['race'].map(RACE_MAP)
df

Unnamed: 0,name_last,name_first,race
0,Walker,Elizabeth,nh_white
1,Palmer,Alton,nh_white
2,Mc Cleod,Alicia,nh_black
3,Scarborough,Dale,nh_white
4,Walker,Daniel,nh_white
...,...,...,...
13710353,Walters,William,nh_white
13710354,Sawyer,Matthew,nh_white
13710355,Thomas,Janine,nh_white
13710356,Campbell,Angel,other


In [3]:
# Exclude voters whose race is 'unknown', then draw the same number of rows
# from every remaining race group (fixed seed for repeatability).
sdf = (
    df.loc[~df['race'].isin(['unknown'])]
      .groupby(['race'])
      .sample(n=int(SAMPLE / 5), random_state=21)
)
# The full frame is no longer needed; release it.
del df

# Additional features: normalise last names to Title Case.
sdf['name_last'] = sdf['name_last'].str.title()
sdf

Unnamed: 0,name_last,name_first,race
5731290,Rao,Kranthi,asian
3915605,Hossain,Tanjim,asian
689897,Nguyen,Bao,asian
8981436,Samaroo,RYAN,asian
11674974,Ramnauth,RYAN,asian
...,...,...,...
11684362,Lewis,Brianna,other
5329148,Hughs,JAMES,other
7712522,Times,Douglas,other
13637468,Knutson,Elizabeth,other


In [4]:
# Rows per race category in the sample.
rdf = sdf.groupby('race')[['name_last']].count()
# columns=[] drops every data column from the written file.
rdf.to_csv('./fl_voter_reg/lstm/fl_ln_five_cat_race.csv', columns=[])
rdf

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,200000
hispanic,200000
nh_black,200000
nh_white,200000
other,200000


In [5]:
# Number of distinct last names per race category.
sdf.groupby('race')[['name_last']].nunique()

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,49168
hispanic,47270
nh_black,28558
nh_white,64985
other,65497


## Preprocessing the input data

In [6]:
# last name only
sdf['name_last_name_first'] = sdf['name_last']

# Build the character n-gram vocabulary (case-sensitive). n-grams present in
# more than 30% of the names or in fewer than 3 names are discarded.
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False)
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_

# Total occurrences of every n-gram, obtained with ONE column-wise sum over
# the sparse count matrix. Slicing a single column per vocabulary entry
# (a[:, c].sum()) rescans the matrix for every n-gram and is far slower.
ngram_counts = np.asarray(a.sum(axis=0)).ravel()

# sort n-gram by freq (highest -> lowest); equal counts are ordered by the
# n-gram text, also descending, because whole (count, ngram) tuples are compared
words = sorted(((ngram_counts[col], ngram) for ngram, col in vocab.items()), reverse=True)

# Index 0 is reserved for n-grams that are not part of the vocabulary.
words_list = ['UNK']
words_list.extend(w[1] for w in words)
num_words = len(words_list)
print("num_words = %d" % num_words)


def _first_index_map(vocabulary):
    """Return ``{item: position of its first occurrence}`` for a sequence."""
    lookup = {}
    for position, item in enumerate(vocabulary):
        # setdefault keeps the FIRST position, matching list.index() semantics
        lookup.setdefault(item, position)
    return lookup


def find_ngrams(text, n, index=None):
    """Encode ``text`` as the vocabulary indices of its character n-grams.

    Parameters
    ----------
    text : str
        The string to encode.
    n : int
        n-gram size; consecutive, overlapping windows of ``n`` characters
        are looked up.
    index : dict, optional
        Mapping ``ngram -> integer index``. When omitted, a mapping is built
        from the global ``words_list`` (position of each n-gram in that list)
        and cached, so the list is not scanned on every lookup.

    Returns
    -------
    list of int
        One index per n-gram, in order of appearance. n-grams missing from
        the mapping are encoded as 0 (the 'UNK' slot). Texts shorter than
        ``n`` yield an empty list.
    """
    if index is None:
        # Rebuild the cached mapping only when ``words_list`` has been
        # replaced by another list object or has changed length.
        cache = getattr(find_ngrams, '_index_cache', None)
        if cache is None or cache[0] is not words_list or cache[1] != len(words_list):
            cache = (words_list, len(words_list), _first_index_map(words_list))
            find_ngrams._index_cache = cache
        index = cache[2]

    # zip over n shifted copies of the text yields the sliding n-char windows
    grams = zip(*[text[i:] for i in range(n)])
    return [index.get(''.join(gram), 0) for gram in grams]

# Encode every name as the list of vocabulary indices of its n-grams.
X = np.array(sdf['name_last_name_first'].apply(find_ngrams, args=(NGRAMS,)))

# Longest and (truncated) mean sequence length.
X_len = [len(seq) for seq in X]
max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))
print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))

# Integer class codes of the race categories.
y = np.array(sdf['race'].astype('category').cat.codes)

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

num_words = 1196
Max feature len = 27, Avg. feature len = 5


## Train an LSTM model

ref: http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [7]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

# Vocabulary size (the 'UNK' slot included), padded sequence length, mini-batch size.
max_features, feature_len, batch_size = num_words, 20, 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
# Bring every n-gram index sequence to exactly feature_len entries.
X_train, X_test = (
    sequence.pad_sequences(seqs, maxlen=feature_len) for seqs in (X_train, X_test)
)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Labels are category codes, so the class count is the largest code plus one.
num_classes = y_train.max() + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
y_train, y_test = (
    tf.keras.utils.to_categorical(labels, num_classes) for labels in (y_train, y_test)
)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

800000 train sequences
200000 test sequences
Pad sequences (samples x time)
X_train shape: (800000, 20)
X_test shape: (200000, 20)
5 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (800000, 5)
y_test shape: (200000, 5)


In [8]:
print('Build model...')

# Embedding of the n-gram indices -> single LSTM layer -> softmax over the
# race categories.
model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that emits an extra "None" line after the table).
model.summary()

Build model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            38272     
_________________________________________________________________
lstm (LSTM)                  (None, 128)               82432     
_________________________________________________________________
dense (Dense)                (None, 5)                 645       
Total params: 121,349
Trainable params: 121,349
Non-trainable params: 0
_________________________________________________________________
None


2021-12-21 08:21:05.800031: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-21 08:21:05.800072: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-21 08:21:05.800090: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (jupyter-server-2): /proc/driver/nvidia/version does not exist
2021-12-21 08:21:05.800295: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
print('Train...')
# Fit for EPOCHS epochs; validation_split=0.1 sets aside a tenth of the
# training data for validation.
model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=1,
)

# The model is compiled with loss + ['accuracy'], so evaluate() yields two values.
test_metrics = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
score, acc = test_metrics
print('Test score:', score)
print('Test accuracy:', acc)

Train...


2021-12-21 08:21:06.163286: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-12-21 08:21:06.163898: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2250000000 Hz


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test score: 1.094052791595459
Test accuracy: 0.5678849816322327


In [10]:
# Show the test-set loss and accuracy again, without the training log around them.
for label, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(label, value)

Test score: 1.094052791595459
Test accuracy: 0.5678849816322327


## Confusion Matrix

In [11]:
# Predicted class probabilities for the test set.
p = model.predict(X_test, verbose=2)
# Collapse probabilities / one-hot rows back to integer class indices.
y_pred = p.argmax(axis=-1)
y_true = y_test.argmax(axis=1)
# Category labels in code order, used to name the report rows.
target_names = sdf.race.astype('category').cat.categories.tolist()
print(classification_report(y_true, y_pred, target_names=target_names))
print(confusion_matrix(y_true, y_pred))

6250/6250 - 27s
              precision    recall  f1-score   support

       asian       0.71      0.63      0.67     40000
    hispanic       0.72      0.83      0.77     40000
    nh_black       0.49      0.78      0.60     40000
    nh_white       0.44      0.53      0.48     40000
       other       0.38      0.06      0.11     40000

    accuracy                           0.57    200000
   macro avg       0.55      0.57      0.53    200000
weighted avg       0.55      0.57      0.53    200000

[[25339  3753  3974  5391  1543]
 [ 1313 33307  1754  2983   643]
 [  945   834 31071  6463   687]
 [ 1698  2414 13220 21288  1380]
 [ 6515  5687 13248 11978  2572]]


## Save model

In [12]:
# Persist the trained model to disk. NOTE(review): the .h5 suffix presumably
# selects Keras' HDF5 format, and the target directory must already exist — confirm.
model.save('./fl_voter_reg/lstm/fl_all_ln_lstm_5_cat.h5')

In [13]:
# Write the vocabulary, one n-gram per row in index order (row 0 is 'UNK'),
# so the n-gram -> index mapping can be rebuilt alongside the saved model.
words_df = pd.DataFrame({'vocab': words_list})
words_df.to_csv(
    './fl_voter_reg/lstm/fl_all_ln_vocab_5_cat.csv',
    index=False,
    encoding='utf-8',
)