In [1]:
from tensorflow import keras
import tensorflow as tf
# Record the library versions this notebook was executed with (Keras first, then TensorFlow).
print(keras.__version__, tf.__version__, sep='\n')

2.6.0
2.6.0


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# Run configuration.
NGRAMS = 2         # size of the character n-grams built from each last name
SAMPLE = 1000000   # number of rows drawn from the voter file
EPOCHS = 15        # epochs requested when fitting the model

# Florida voter registration file; rows missing either name part are dropped.
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df = df.dropna(subset=['name_first', 'name_last'])

# Exclude the race labels that are not modelled, then draw a reproducible sample.
sdf = df[~df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown'])].sample(SAMPLE, random_state=21)
del df  # the full table is no longer needed

# Additional features: normalise last names to Title Case.
sdf['name_last'] = sdf.name_last.str.title()

sdf

Unnamed: 0,name_last,name_first,race
841323,Torres,Jose,hispanc
1408926,Da Silva,Amanda,nh_white
1733118,Mc Ghee,Sandra,nh_white
13104513,Karam,MELINDA,nh_white
9156114,Brewer,LAIA,nh_black
...,...,...,...
3076722,Antunez Avila,Robert,hispanc
10023679,Davis,WYATT,nh_white
5846252,Scott,Jacquelyn,nh_white
5959131,Parton,Douglas,nh_white


In [3]:
# Number of sampled rows per race label.
rdf = sdf.groupby('race')[['name_last']].count()
# columns=[] is passed on purpose; presumably so that only the index (the race labels)
# ends up in the file -- TODO confirm against the consumer of this CSV.
rdf.to_csv('../dataverse_files/fl_voter_reg/lstm/fl_ln_race.csv', columns=[])
rdf

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,19431
hispanc,166865
nh_black,142675
nh_white,671029


In [4]:
# Number of distinct last names per race label.
sdf.groupby('race')[['name_last']].nunique()

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
asian,9364
hispanc,41721
nh_black,23079
nh_white,145852


## Preprocessing the input data

In [5]:
# Preview the first rows of the sampled frame before building features.
sdf.head()

Unnamed: 0,name_last,name_first,race
841323,Torres,Jose,hispanc
1408926,Da Silva,Amanda,nh_white
1733118,Mc Ghee,Sandra,nh_white
13104513,Karam,MELINDA,nh_white
9156114,Brewer,LAIA,nh_black


In [6]:
# last name only (the column keeps its historical name; it holds just the last name)
sdf['name_last_name_first'] = sdf['name_last']

# build n-gram list: case-sensitive character n-grams of exactly NGRAMS characters,
# filtered by document frequency through max_df / min_df
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False)
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_

# Total occurrences of every n-gram, computed with a single column-wise sum over the
# sparse count matrix instead of slicing one column per vocabulary entry.
ngram_totals = np.asarray(a.sum(axis=0)).ravel()

# sort n-gram by freq (highest -> lowest); ties fall back to the n-gram string itself.
# The position of an n-gram in words_list is the integer id used to encode names.
words = sorted(((ngram_totals[col], ngram) for ngram, col in vocab.items()), reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n, vocab=None):
    """Encode ``text`` as the vocabulary positions of its character n-grams.

    Parameters
    ----------
    text : str
        String to encode. Strings shorter than ``n`` yield an empty list.
    n : int
        Number of characters per n-gram.
    vocab : sequence of str, optional
        Ordered vocabulary; an n-gram is encoded as its position in this
        sequence. Defaults to the module-level ``words_list``.

    Returns
    -------
    list of int
        One index per overlapping n-gram, in order of appearance. N-grams that
        are not in the vocabulary are encoded as 0. Note that 0 is also the
        position of the first vocabulary entry, so the two cannot be told
        apart in the output.
    """
    if vocab is None:
        vocab = words_list
    encoded = []
    # zip over n shifted copies of the text yields every run of n consecutive characters
    for chars in zip(*[text[i:] for i in range(n)]):
        gram = ''.join(chars)
        try:
            idx = vocab.index(gram)
        except ValueError:
            # Only "not in vocabulary" maps to 0; anything else (including
            # KeyboardInterrupt during a long run) must propagate.
            idx = 0
        encoded.append(idx)
    return encoded

# Encode every last name as its sequence of n-gram indices (object array of lists).
X = np.array(sdf.name_last_name_first.apply(lambda last_name: find_ngrams(last_name, NGRAMS)))

# Sequence-length statistics, used to judge the padding length chosen later.
X_len = [len(seq) for seq in X]
max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))

# Integer class codes follow the order of the categorical race labels.
y = np.array(sdf.race.astype('category').cat.codes)

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

num_words = 1166
Max feature len = 26, Avg. feature len = 5


## Train an LSTM model

ref: http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [7]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, Activation
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.models import load_model

max_features = num_words  # vocabulary size (one id per n-gram)
feature_len = 20  # every sequence is padded/truncated to this many n-grams
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
# Bring both splits to a fixed width of feature_len.
X_train, X_test = [
    sequence.pad_sequences(seqs, maxlen=feature_len) for seqs in (X_train, X_test)
]
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Class codes start at 0, so the largest code + 1 is the number of classes.
num_classes = y_train.max() + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train, y_test = [
    tf.keras.utils.to_categorical(labels, num_classes) for labels in (y_train, y_test)
]
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

800000 train sequences
200000 test sequences
Pad sequences (samples x time)
X_train shape: (800000, 20)
X_test shape: (200000, 20)
4 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (800000, 4)
y_test shape: (200000, 4)


# Uncertainty Model

In [8]:
print('Build uncertainty model...')

# Functional-API model: n-gram ids -> embedding -> LSTM -> softmax over the classes.
# shape must be a tuple; "(feature_len)" without the trailing comma is just an int.
input_ = keras.layers.Input(shape=(feature_len,))
layer_ = Embedding(num_words, 32)(input_)
# training=True keeps the LSTM's dropout active at prediction time as well, so repeated
# predictions on the same input differ; that spread is what the uncertainty estimates use.
layer_ = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(layer_, training=True)
output = Dense(num_classes, activation='softmax')(layer_)

model_uncrtn = keras.models.Model(inputs=input_, outputs=output)

# try using different optimizers and different optimizer configs
model_uncrtn.compile(loss='categorical_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model_uncrtn.summary()

Build uncertainty model...
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 20)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 20, 32)            37312     
_________________________________________________________________
lstm (LSTM)                  (None, 128)               82432     
_________________________________________________________________
dense (Dense)                (None, 4)                 516       
Total params: 120,260
Trainable params: 120,260
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
print('Train...')
# Fit on the training split; a tenth of it is held out for validation.
model_uncrtn.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=1,
)
# The LSTM was wired with training=True, so dropout stays active here too and the
# reported test metrics can vary from one evaluation to the next.
score, acc = model_uncrtn.evaluate(
    X_test,
    y_test,
    batch_size=batch_size,
    verbose=1,
)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15

KeyboardInterrupt: 

## Save model

In [None]:
# Persist the uncertainty model next to the other LSTM artefacts
# (the .h5 suffix presumably selects Keras' HDF5 format -- confirm).
model_uncrtn.save('../dataverse_files/fl_voter_reg/lstm/fl_all_ln_lstm_uncrtn.h5')

In [None]:
# Save the n-gram vocabulary; row order matters because an n-gram's position in
# words_list is the integer id it was encoded with.
words_df = pd.DataFrame({'vocab': words_list})
words_df.to_csv(
    '../dataverse_files/fl_voter_reg/lstm/fl_all_ln_vocab.csv',
    index=False,
    encoding='utf-8',
)

In [None]:
# Number of stochastic forward passes over the whole test set.
ITER=100

# Each pass differs because dropout stays active in model_uncrtn at prediction time.
predictions = [model_uncrtn.predict(X_test, verbose=1) for _ in range(ITER)]

In [None]:
# Stack the passes into one array: axis 0 indexes the pass, the remaining axes are
# those of a single predict() output.
predict_array = np.asarray(predictions)

In [None]:
# Creating a separate array for each measurement, aggregated over the passes (axis 0).
# The class count is read from the predictions themselves rather than hard-coded.
mean_arr = predict_array.mean(axis=0).reshape(-1, predict_array.shape[-1])
std_arr = predict_array.std(axis=0).reshape(-1, predict_array.shape[-1])
pct_5_arr = np.quantile(predict_array, .05, axis=0).reshape(-1, predict_array.shape[-1])
pct_95_arr = np.quantile(predict_array, .95, axis=0).reshape(-1, predict_array.shape[-1])

In [None]:
# Select, for each row, the class whose mean probability across the passes is highest.
final_pred_arr = np.argmax(mean_arr, axis=1)

In [None]:
# Sanity check: argmax over axis 1 leaves one class index per row of mean_arr.
final_pred_arr.shape

# Creating Final DataFrame

In [None]:
# Race labels in category order, i.e. the same order as the integer codes used for y.
target_names = sdf.race.astype('category').cat.categories.tolist()

In [None]:
# One row per test example, describing the predicted (highest-mean) class:
#   pred      - class index
#   category  - class label
#   pred_5    - 5th percentile of that class's probability across the passes
#   pred_95   - 95th percentile of that class's probability across the passes
#   pred_se   - standard deviation of that class's probability across the passes
# Built in one vectorised step: appending row by row copies the frame on every
# iteration, and DataFrame.append no longer exists in pandas 2.x.
row_idx = np.arange(len(final_pred_arr))
predict_df = pd.DataFrame(
    {
        'pred': final_pred_arr,
        'category': [target_names[cls] for cls in final_pred_arr],
        'pred_5': pct_5_arr[row_idx, final_pred_arr],
        'pred_95': pct_95_arr[row_idx, final_pred_arr],
        'pred_se': std_arr[row_idx, final_pred_arr],
    },
    columns=['pred', 'category', 'pred_5', 'pred_95', 'pred_se'],
)

In [None]:
# Display the per-row prediction summary (pandas truncates long frames when rendering).
predict_df

In [None]:
# How many test rows were assigned to each predicted category.
predict_df['category'].value_counts()

In [None]:
def predict_ci(name, num_iter=100, conf_int=0.9):
    """Predict the category of a last name together with a dropout-based interval.

    The name is encoded with ``find_ngrams`` and ``pad_sequences`` and passed
    through ``model_uncrtn`` ``num_iter`` times. The class with the highest mean
    probability is reported along with the spread of its probability.

    Parameters
    ----------
    name : str
        Last name to classify.
    num_iter : int, default 100
        Number of stochastic forward passes; must be at least 1.
    conf_int : float, default 0.9
        Central mass of the reported interval, between 0 and 1 inclusive
        (0.9 -> 5th and 95th percentiles).

    Returns
    -------
    tuple
        ``(label, mean_probability, std, [low, high])`` where ``label`` comes
        from ``target_names`` and the remaining values describe the predicted
        class's probability across the passes. ``std`` is the standard
        deviation of the sampled probabilities (not a standard error of the mean).

    Raises
    ------
    ValueError
        If ``num_iter`` is smaller than 1 or ``conf_int`` is outside [0, 1].
    """
    if num_iter < 1:
        raise ValueError("num_iter must be at least 1")
    if not 0 <= conf_int <= 1:
        raise ValueError("conf_int must be between 0 and 1")

    low_quantile = 0.5 - (conf_int / 2)
    high_quantile = 0.5 + (conf_int / 2)

    np_name = np.array(find_ngrams(name, NGRAMS)).reshape(1, -1)
    encoded_name = sequence.pad_sequences(np_name, maxlen=feature_len)

    # One batched call instead of num_iter single-row predict() calls: Keras draws
    # dropout masks per sample, so the num_iter identical rows are independent
    # stochastic passes while the per-call overhead is paid only once.
    mc_batch = np.repeat(encoded_name, num_iter, axis=0)
    samples = np.asarray(model_uncrtn.predict(mc_batch, verbose=0))  # (num_iter, classes)

    mean_probs = samples.mean(axis=0)
    best = int(mean_probs.argmax())
    best_samples = samples[:, best]

    final_pred_class = target_names[best]
    final_conf_val = mean_probs[best]
    final_std_err = best_samples.std()
    final_low_pct = np.quantile(best_samples, low_quantile)
    final_high_pct = np.quantile(best_samples, high_quantile)
    return final_pred_class, final_conf_val, final_std_err, [final_low_pct, final_high_pct]

In [None]:
# Example: default settings (100 passes, 90% interval) for the last name "Wang".
class_name, per_conf, std_err, conf_int = predict_ci("Wang")

In [None]:
# (predicted label, mean probability, std of the probability, [low, high] interval)
class_name, per_conf, std_err, conf_int

In [None]:
# Example: default settings for the last name "McMahon".
class_name, per_conf, std_err, conf_int = predict_ci("McMahon")

In [None]:
# (predicted label, mean probability, std of the probability, [low, high] interval)
class_name, per_conf, std_err, conf_int

In [None]:
# Example: default settings for the last name "Sood".
class_name, per_conf, std_err, conf_int = predict_ci("Sood")

In [None]:
# (predicted label, mean probability, std of the probability, [low, high] interval)
class_name, per_conf, std_err, conf_int

In [None]:
def predict_quant(name, num_iter=100, quant=(0.25, 0.75)):
    """Predict the category of a last name and report chosen probability quantiles.

    The name is encoded with ``find_ngrams`` and ``pad_sequences`` and passed
    through ``model_uncrtn`` ``num_iter`` times. The class with the highest mean
    probability is reported along with quantiles of its probability.

    Parameters
    ----------
    name : str
        Last name to classify.
    num_iter : int, default 100
        Number of stochastic forward passes; must be at least 1.
    quant : iterable of float, default (0.25, 0.75)
        Quantiles (each between 0 and 1) to report. The input is not modified.

    Returns
    -------
    tuple
        ``(label, mean_probability, std, quantile_values)``. ``quantile_values``
        is a list with one value per requested quantile, ordered by ascending
        quantile (not by the order in which they were passed). ``std`` is the
        standard deviation of the sampled probabilities.

    Raises
    ------
    ValueError
        If ``num_iter`` is smaller than 1, or (raised by ``np.quantile``) if a
        requested quantile lies outside [0, 1].
    """
    if num_iter < 1:
        raise ValueError("num_iter must be at least 1")

    np_name = np.array(find_ngrams(name, NGRAMS)).reshape(1, -1)
    encoded_name = sequence.pad_sequences(np_name, maxlen=feature_len)

    # One batched call instead of num_iter single-row predict() calls: Keras draws
    # dropout masks per sample, so the num_iter identical rows are independent
    # stochastic passes while the per-call overhead is paid only once.
    mc_batch = np.repeat(encoded_name, num_iter, axis=0)
    samples = np.asarray(model_uncrtn.predict(mc_batch, verbose=0))  # (num_iter, classes)

    mean_probs = samples.mean(axis=0)
    best = int(mean_probs.argmax())
    best_samples = samples[:, best]

    final_pred_class = target_names[best]
    final_conf_val = mean_probs[best]
    final_std_err = best_samples.std()
    # sorted() returns a new list, so the caller's sequence is left untouched.
    quant_final = [np.quantile(best_samples, q) for q in sorted(quant)]
    return final_pred_class, final_conf_val, final_std_err, quant_final

In [None]:
# Example: quartiles and median for "Wang" from only 10 passes (a coarse estimate).
class_name, per_conf, std_err, quant_res = predict_quant("Wang", num_iter=10, quant=[.25,.5,.75])

In [None]:
# (predicted label, mean probability, std of the probability, quantile values in ascending order)
class_name, per_conf, std_err, quant_res

In [None]:
# Example with five quantiles for "Hernandez".
# NOTE(review): 0.5 is listed twice, so the median is returned twice -- confirm this is intended.
class_name, per_conf, std_err, quant_res = predict_quant("Hernandez", num_iter=10, quant=[.1,.4,.5,.5,.9])

In [None]:
# (predicted label, mean probability, std of the probability, quantile values in ascending order)
class_name, per_conf, std_err, quant_res

In [None]:
# Example: 5th and 95th percentiles for "Stewart" from 10 passes.
class_name, per_conf, std_err, quant_res = predict_quant("Stewart", num_iter=10, quant=[.05,.95])

In [None]:
# (predicted label, mean probability, std of the probability, quantile values in ascending order)
class_name, per_conf, std_err, quant_res