In [1]:
import keras
import tensorflow as tf
# Report the installed Keras and TensorFlow versions, one per line.
print(keras.__version__, tf.__version__, sep='\n')

2.5.0
2.5.0


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# --- Configuration ---------------------------------------------------------
NGRAMS = 2      # size of the character n-grams used as features
EPOCHS = 15     # number of training epochs
YEAR = '2010'   # census file to load (alternative: '2000')

# --- Load the census surname table ------------------------------------------
# Rows without a name are dropped and the '(S)' placeholder is replaced by 0.
df = (
    pd.read_csv('../data/census/census_%s.csv' % YEAR)
    .dropna(subset=['name'])
    .replace('(S)', 0)
)
df

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1,2442977,828.19,828.19,70.9,23.11,0.5,0.89,2.19,2.4
1,JOHNSON,2,1932812,655.24,1483.42,58.97,34.63,0.54,0.94,2.56,2.36
2,WILLIAMS,3,1625252,550.97,2034.39,45.75,47.68,0.46,0.82,2.81,2.49
3,BROWN,4,1437026,487.16,2521.56,57.95,35.6,0.51,0.87,2.55,2.52
4,JONES,5,1425470,483.24,3004.80,55.19,38.48,0.44,1,2.61,2.29
...,...,...,...,...,...,...,...,...,...,...,...
162249,DIETZMANN,160975,100,0.03,90062.93,96,0,0,0,0,0
162250,DOKAS,160975,100,0.03,90062.96,94,0,0,0,0,0
162251,DONLEA,160975,100,0.03,90062.99,94,0,0,0,0,6
162252,DORIOTT,160975,100,0.03,90063.03,89,0,0,0,5,0


## Resampling with weight

In [3]:
# Draw 1,000,000 rows with replacement, weighting each surname by its `count`.
# A fixed random_state (the same value the train/test split uses) makes the
# resampled frame identical across "Restart & Run All" executions.
sdf = df.sample(1000000, weights=df['count'], replace=True, random_state=21)

## Assign race by percentage

In [4]:
from numpy.random import choice

# Candidate labels, in the same order as the percentage columns passed to to_race.
races = ['white', 'black', 'api', 'hispanic']

def to_race(c):
    """Randomly pick one label from ``races`` in proportion to the weights in ``c``.

    ``c`` holds one weight per entry of ``races`` (values are converted to
    float, so numeric strings are accepted). The weights are normalised to
    probabilities before sampling. When they sum to zero, 'white' is returned.
    """
    weights = np.array(c).astype(float)
    total = weights.sum()
    if total != 0:
        return choice(races, p=weights / total)
    return 'white'

# `choice` (imported from numpy.random) draws from NumPy's global random state.
# Seed it so that, for a given `sdf`, the sampled race labels are repeatable
# when this cell is re-run.
np.random.seed(21)

# One label per sampled row, drawn from that row's four percentage columns.
sdf['race'] = sdf[['pctwhite', 'pctblack', 'pctapi', 'pcthispanic']].apply(to_race, axis=1)
sdf

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic,race
9776,PETRICK,9777,3315,1.12,68108.85,95.44,0.48,0.66,0.27,1.09,2.05,white
1762,CARLISLE,1763,20365,6.90,47775.60,79.31,15.07,0.53,0.8,2.16,2.14,white
28341,FARNAM,28333,843,0.29,78366.15,94.42,0,1.42,0,1.19,1.9,white
244,HARPER,245,124461,42.19,25289.88,67.9,26.22,0.49,0.72,2.36,2.31,white
1,JOHNSON,2,1932812,655.24,1483.42,58.97,34.63,0.54,0.94,2.56,2.36,hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...
1,JOHNSON,2,1932812,655.24,1483.42,58.97,34.63,0.54,0.94,2.56,2.36,white
219,RAY,220,133171,45.15,24197.50,74.97,17.54,1.94,0.93,2.05,2.56,white
23376,PAVLOVIC,23371,1089,0.37,76757.15,95.78,0,0,0,1.1,2.85,white
548,DRAKE,549,61162,20.73,33986.96,79.55,14.92,0.52,0.62,2.04,2.34,white


## Check the correctness of race assignment

In [5]:
# Census reference row for SMITH, to compare against the sampled shares below.
df.loc[df['name'] == 'SMITH']

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1,2442977,828.19,828.19,70.9,23.11,0.5,0.89,2.19,2.4


In [6]:
# Count the sampled SMITH rows per assigned race ...
smith_rows = sdf[sdf['name'] == 'SMITH']
xdf = smith_rows.groupby(['race']).agg({'name': 'count'})

# ... and express each count as a percentage of all sampled SMITH rows.
xdf * 100 / xdf.sum()

Unnamed: 0_level_0,name
race,Unnamed: 1_level_1
api,0.531594
black,23.716322
hispanic,2.694213
white,73.057871


In [7]:
# Additional feature: title-cased surname (e.g. 'SMITH' -> 'Smith')
sdf['name_last'] = sdf['name'].str.title()

# Number of sampled rows per assigned race
sdf.groupby('race')[['name_last']].count()

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
api,50502
black,123917
hispanic,163911
white,661670


In [8]:
# Number of rows in the resampled frame
sdf.shape[0]

1000000

## Preprocessing the input data

In [9]:
# The census data only contains last names, so the model input is the last name itself.
sdf['name_last_name_first'] = sdf['name_last']

# Character n-gram vocabulary: keep n-grams that occur in at least 3 names
# and in at most 30% of the names; case is preserved.
vect = CountVectorizer(
    analyzer='char',
    ngram_range=(NGRAMS, NGRAMS),
    min_df=3,
    max_df=0.3,
    lowercase=False,
)
a = vect.fit_transform(sdf['name_last_name_first'])
vocab = vect.vocabulary_
len(vocab)

972

In [10]:
import operator

# Order the (n-gram, column index) pairs by column index so that the n-gram
# names line up with the columns of the count matrix `a`.
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1))
cols = [ngram for ngram, _ in sorted_vocab]

In [11]:
# Dense view of the n-gram count matrix, one column per n-gram.
# NOTE(review): this materialises the full sparse matrix in memory
# (1,000,000 x 972 for the run shown here); keep an eye on RAM.
count_df = pd.DataFrame(a.toarray(), columns=cols)
count_df

Unnamed: 0,N,O,Aa,Ab,Ac,Ad,Ae,Af,Ag,Ah,...,zp,zq,zr,zs,zt,zu,zv,zw,zy,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Summary statistics of the total occurrence count of each n-gram
(
    count_df
    .sum()
    .sort_values(ascending=False)
    .describe()
)

count       972.000000
mean       6355.089506
std       18222.358934
min           3.000000
25%          78.750000
50%         864.000000
75%        5045.750000
max      253806.000000
dtype: float64

In [13]:
# Show at most 20 rows so the listing below is truncated to its head and tail.
pd.set_option('display.max_rows', 20)

# Total occurrences per n-gram, most frequent first
(
    count_df
    .sum()
    .sort_values(ascending=False)
)

er    253806
ll    174755
es    143920
am    131728
he    124202
       ...  
Zg         3
xk         3
gv         3
Lw         3
nx         3
Length: 972, dtype: int64

In [14]:
# Sort n-grams by total frequency (highest -> lowest).
# The column totals are computed once for the whole sparse matrix instead of
# slicing and summing one column per n-gram.
ngram_totals = np.asarray(a.sum(axis=0)).ravel()

# (total, n-gram) pairs; ties on the total are broken by the n-gram string,
# also in descending order.
words = sorted(((ngram_totals[col], ngram) for ngram, col in vocab.items()), reverse=True)

# Position in `words_list` is the integer id later used to encode an n-gram.
words_list = [ngram for _, ngram in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 972


In [15]:
def find_ngrams(text, n, vocab=None):
    """Encode ``text`` as the vocabulary indices of its character n-grams.

    Parameters
    ----------
    text : str
        String to encode.
    n : int
        N-gram size; every window of ``n`` consecutive characters is looked up.
    vocab : sequence of str, optional
        Ordered n-gram vocabulary; an n-gram's index is its (first) position
        in this sequence. Defaults to the notebook-level ``words_list``.

    Returns
    -------
    list of int
        One index per n-gram, in order of appearance. N-grams that are not in
        the vocabulary map to 0. A text shorter than ``n`` (or ``n <= 0``)
        yields an empty list.

    Notes
    -----
    Index 0 is ambiguous: it is the fallback for unknown n-grams and also the
    position of the first vocabulary entry. This is kept unchanged so the
    encoding stays the same as before.
    NOTE(review): ``pad_sequences`` is later called without an explicit
    padding value, which presumably also pads with 0 - confirm whether a
    reserved padding/unknown id is wanted.
    """
    if vocab is None:
        vocab = words_list
    if n <= 0:
        return []

    # Build the n-gram -> index mapping once per vocabulary object and reuse it,
    # instead of scanning the vocabulary list linearly for every n-gram.
    cached = getattr(find_ngrams, '_index_cache', None)
    if cached is None or cached[0] is not vocab or cached[1] != len(vocab):
        index = {}
        for position, ngram in enumerate(vocab):
            # setdefault keeps the first position, matching list.index()
            index.setdefault(ngram, position)
        cached = (vocab, len(vocab), index)
        find_ngrams._index_cache = cached
    index = cached[2]

    return [index.get(text[i:i + n], 0) for i in range(len(text) - n + 1)]

# Encode every surname as its list of n-gram indices
X = np.array(sdf['name_last_name_first'].apply(lambda name: find_ngrams(name, NGRAMS)))

# Longest and (truncated) mean sequence length
X_len = [len(seq) for seq in X]
max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))
print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))

# Integer class codes of the race labels
y = np.array(sdf['race'].astype('category').cat.codes)

# 80/20 train/test split, stratified on the class codes
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

Max feature len = 14, Avg. feature len = 6


## Train a LSTM model

ref: http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [16]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words  # vocabulary size
feature_len = 20          # every sequence is padded/truncated to this many n-gram ids
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

# Bring all sequences to the same length so they form a 2-D array
print('Pad sequences (samples x time)')
X_train, X_test = (
    sequence.pad_sequences(seqs, maxlen=feature_len)
    for seqs in (X_train, X_test)
)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Class codes start at 0, so the number of classes is the largest code + 1
num_classes = y_train.max() + 1
print(num_classes, 'classes')

# One-hot encode the class codes
print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
y_train, y_test = (
    tf.keras.utils.to_categorical(codes, num_classes)
    for codes in (y_train, y_test)
)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

800000 train sequences
200000 test sequences
Pad sequences (samples x time)
X_train shape: (800000, 20)
X_test shape: (200000, 20)
4 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (800000, 4)
y_test shape: (200000, 4)


In [17]:
print('Build model...')

# Embedding of the n-gram ids -> single LSTM layer -> softmax over the classes
model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that appended a stray "None" to the output).
model.summary()

Build model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            31104     
_________________________________________________________________
lstm (LSTM)                  (None, 128)               82432     
_________________________________________________________________
dense (Dense)                (None, 4)                 516       
Total params: 114,052
Trainable params: 114,052
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
print('Train...')

# Fit on the training data; the validation_split=0.1 fraction is held out for validation
model.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=1,
)

# Loss and accuracy on the held-out test set
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 0.5722097754478455
Test accuracy: 0.802590012550354


## Confusion Matrix

In [19]:
# Class probabilities for the test set; the predicted class is the most probable one
p = model.predict(X_test, verbose=2)
y_pred = p.argmax(axis=-1)

# Label names in class-code order
target_names = sdf['race'].astype('category').cat.categories.tolist()

# y_test is one-hot encoded at this point; recover the integer class codes
y_test_codes = y_test.argmax(axis=1)
print(classification_report(y_test_codes, y_pred, target_names=target_names))
print(confusion_matrix(y_test_codes, y_pred))

6250/6250 - 30s
              precision    recall  f1-score   support

         api       0.85      0.57      0.68     10101
       black       0.57      0.07      0.12     24783
    hispanic       0.88      0.77      0.82     32782
       white       0.79      0.97      0.87    132334

    accuracy                           0.80    200000
   macro avg       0.77      0.59      0.62    200000
weighted avg       0.78      0.80      0.76    200000

[[  5728     59    770   3544]
 [   179   1629    219  22756]
 [   170     75  25127   7410]
 [   671   1082   2547 128034]]


## Save model

In [20]:
# Persist the trained model in HDF5 format; the file name carries the census year.
model_path = './census/lstm/census%s_ln_lstm.h5' % YEAR
model.save(model_path)

In [21]:
# Save the frequency-ordered n-gram vocabulary; the row position of an n-gram
# is the integer id used to encode it.
words_df = pd.DataFrame({'vocab': words_list})
vocab_path = './census/lstm/census%s_ln_vocab.csv' % YEAR
words_df.to_csv(vocab_path, index=False, encoding='utf-8')

In [22]:
# Same report as for the test set, computed on the training data.
# `Sequential.predict_classes` is deprecated; the supported equivalent for a
# softmax output is the argmax over the predicted class probabilities.
y_pred = np.argmax(model.predict(X_train, verbose=2), axis=-1)

# Label names in class-code order
target_names = list(sdf.race.astype('category').cat.categories)

# y_train is one-hot encoded at this point; recover the integer class codes
print(classification_report(np.argmax(y_train, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_train, axis=1), y_pred))



25000/25000 - 116s
              precision    recall  f1-score   support

         api       0.86      0.58      0.69     40401
       black       0.58      0.07      0.12     99134
    hispanic       0.88      0.77      0.82    131129
       white       0.79      0.97      0.87    529336

    accuracy                           0.80    800000
   macro avg       0.78      0.60      0.63    800000
weighted avg       0.78      0.80      0.76    800000

[[ 23427    278   3129  13567]
 [   729   6844    920  90641]
 [   584    307 101125  29113]
 [  2354   4373  10034 512575]]


In [23]:
# Save the class labels, one per row under a 'race' header, in class-code order.
# They are taken from the same categorical that produces the integer codes the
# model is trained on, rather than from `xdf`, which only reflects the races
# that happened to be sampled for the single name SMITH.
race_labels = list(sdf.race.astype('category').cat.categories)
pd.DataFrame({'race': race_labels}).to_csv('./census/lstm/census%s_race.csv' % YEAR, index=False)