In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2
EPOCHS = 5
#YEAR = '2000'
YEAR = '2010'

df = pd.read_csv('./data/census/census_%s.csv' % YEAR)
df.dropna(subset=['name'], inplace=True)
df.replace('(S)', 0, inplace=True)
df

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1,2442977,828.19,828.19,70.9,23.11,0.5,0.89,2.19,2.4
1,JOHNSON,2,1932812,655.24,1483.42,58.97,34.63,0.54,0.94,2.56,2.36
2,WILLIAMS,3,1625252,550.97,2034.39,45.75,47.68,0.46,0.82,2.81,2.49
3,BROWN,4,1437026,487.16,2521.56,57.95,35.6,0.51,0.87,2.55,2.52
4,JONES,5,1425470,483.24,3004.80,55.19,38.48,0.44,1,2.61,2.29
5,GARCIA,6,1166120,395.32,3400.12,5.38,0.45,1.41,0.47,0.26,92.03
6,MILLER,7,1161437,393.74,3793.86,84.11,10.76,0.54,0.66,1.77,2.17
7,DAVIS,8,1116357,378.45,4172.31,62.2,31.6,0.49,0.82,2.45,2.44
8,RODRIGUEZ,9,1094924,371.19,4543.50,4.75,0.54,0.57,0.18,0.18,93.77
9,MARTINEZ,10,1060159,359.40,4902.90,5.28,0.49,0.6,0.51,0.22,92.91


## Resampling with weight

In [2]:
sdf = df.sample(1000000, weights=df['count'], replace=True)

## Assign race by pertcentage

In [3]:
from numpy.random import choice

races = ['white', 'black', 'api', 'hispanic']

def to_race(c):
    w = np.array(c).astype(float)
    probs = w/w.sum()
    return choice(races, p=probs)

sdf['race'] = sdf[['pctwhite', 'pctblack', 'pctapi', 'pcthispanic']].apply(lambda c: to_race(c), axis=1)
sdf



Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic,race
1712,ZARATE,1713,20992,7.12,47426.08,4.04,0.25,2.02,0.19,0.22,93.28,hispanic
15,THOMAS,16,756142,256.34,6632.26,52.63,38.75,2.44,1.07,2.58,2.54,white
1242,BELCHER,1243,28321,9.60,43555.09,80.09,15.12,0.44,0.52,1.68,2.15,black
162253,ALL OTHER NAMES,0,29312001,9936.97,9936.97,66.65,8.53,7.97,0.86,2.32,13.67,hispanic
46874,HIPPERT,46811,454,0.15,82195.91,91.41,1.76,0,0,3.08,2.64,white
7820,REVELES,7821,4233,1.44,65624.73,5.34,0.54,0.26,0.26,0.17,93.43,hispanic
292,PADILLA,293,105365,35.72,27142.99,6.15,0.4,2.1,0.79,0.41,90.16,hispanic
2037,CRAIN,2038,17650,5.98,49543.14,85.8,8.39,0.55,0.99,1.69,2.59,black
1002,BRAY,1003,34906,11.83,41003.86,81.76,12.26,0.6,0.84,2.21,2.33,white
3486,HIATT,3486,10247,3.47,56114.78,94.15,0.4,0.78,0.42,1.5,2.74,white


## Check the correctness of race assignment

In [4]:
df[df.name == 'SMITH']

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1,2442977,828.19,828.19,70.9,23.11,0.5,0.89,2.19,2.4


In [5]:
xdf = sdf[sdf.name=='SMITH'].groupby(['race']).agg({'name': 'count'})
xdf * 100 / xdf.sum()

Unnamed: 0_level_0,name
race,Unnamed: 1_level_1
api,0.426641
black,24.306708
hispanic,2.536146
white,72.730505


In [6]:
# Additional features
sdf['name_last'] = sdf.name.str.title()
sdf.groupby('race').agg({'name_last': 'count'})

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
api,50572
black,124033
hispanic,164615
white,660780


In [7]:
len(sdf)

1000000

## Preprocessing the input data

In [8]:
# only last name in Census data
sdf['name_last_name_first'] = sdf['name_last']

# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
#vect = CountVectorizer(analyzer='char', ngram_range=(2, 2), lowercase=False) 
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_
len(vocab)

978

In [9]:
import operator
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1))
cols = list(map(operator.itemgetter(0), sorted_vocab))

In [10]:
count_df = pd.DataFrame(a.todense(), columns=cols)
count_df

Unnamed: 0,N,O,Aa,Ab,Ac,Ad,Ae,Af,Ag,Ah,...,zp,zq,zr,zs,zt,zu,zv,zw,zy,zz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
count_df.sum().sort_values(ascending=False).describe()

count       978.000000
mean       6316.878323
std       18202.873651
min           3.000000
25%          77.250000
50%         824.000000
75%        4998.500000
max      254497.000000
dtype: float64

In [12]:
pd.set_option('display.max_rows', 20)
count_df.sum().sort_values(ascending=False)

er    254497
ll    174721
es    144434
am    132210
he    124637
th    119443
me    115399
on    108631
Al    108494
Na    102842
       ...  
kg         3
Hn         3
xp         3
xy         3
Wd         3
Zr         3
mv         3
kx         3
Zg         3
Tc         3
dtype: int64

In [13]:
# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 978


In [14]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

# build X from index of n-gram sequence
X = np.array(sdf.name_last_name_first.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = []
for x in X:
    X_len.append(len(x))

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(sdf.race.astype('category').cat.codes)

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

Max feature len = 14, Avg. feature len = 6


## Train a LSTM model

ref: http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [15]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words # 20000
feature_len = 20 # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

Using TensorFlow backend.


800000 train sequences
200000 test sequences
Pad sequences (samples x time)
X_train shape: (800000, 20)
X_test shape: (200000, 20)
4 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (800000, 4)
y_test shape: (200000, 4)


In [16]:
print('Build model...')

model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 32)            31296     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 516       
Total params: 114,244.0
Trainable params: 114,244
Non-trainable params: 0.0
_________________________________________________________________
None


In [17]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 720000 samples, validate on 80000 samples
Epoch 1/5
1361s - loss: 0.6619 - acc: 0.7737 - val_loss: 0.6213 - val_acc: 0.7861
Epoch 2/5
1235s - loss: 0.6224 - acc: 0.7867 - val_loss: 0.6064 - val_acc: 0.7917
Epoch 3/5
1215s - loss: 0.6098 - acc: 0.7905 - val_loss: 0.5976 - val_acc: 0.7943
Epoch 4/5
1256s - loss: 0.6026 - acc: 0.7928 - val_loss: 0.5907 - val_acc: 0.7960
Epoch 5/5
1243s - loss: 0.5978 - acc: 0.7942 - val_loss: 0.5872 - val_acc: 0.7964
Test score: 0.588502460248
Test accuracy: 0.79689


## Confusion Matrix

In [18]:
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(sdf.race.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))

             precision    recall  f1-score   support

        api       0.82      0.56      0.66     10114
      black       0.64      0.02      0.04     24807
   hispanic       0.87      0.76      0.81     32923
      white       0.78      0.97      0.87    132156

avg / total       0.78      0.80      0.74    200000

[[  5614     24    788   3688]
 [   224    507    238  23838]
 [   164     18  25013   7728]
 [   815    239   2858 128244]]


## Save model

In [19]:
model.save('./models/census/lstm/census%s_ln_lstm.h5' % YEAR)

In [20]:
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('./models/census/lstm/census%s_ln_vocab.csv' % YEAR, index=False, encoding='utf-8')

In [21]:
y_pred = model.predict_classes(X_train, verbose=2)
target_names = list(sdf.race.astype('category').cat.categories)
print(classification_report(np.argmax(y_train, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_train, axis=1), y_pred))

             precision    recall  f1-score   support

        api       0.84      0.56      0.67     40458
      black       0.62      0.02      0.04     99226
   hispanic       0.87      0.76      0.81    131692
      white       0.79      0.97      0.87    528624

avg / total       0.78      0.80      0.75    800000

[[ 22564     81   3256  14557]
 [   887   1974   1078  95287]
 [   672     90 100392  30538]
 [  2837   1043  11080 513664]]
