In [1]:
from __future__ import print_function

import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# --- Configuration ---------------------------------------------------------
NGRAMS = 2     # size of the character n-grams built from each name
EPOCHS = 15    # number of epochs passed to model.fit
YEAR = '2000'  # selects ./data/census/census_<YEAR>.csv; set to '2010' for the other file

# Percentage columns of the census file; these may contain the '(S)' placeholder.
PCT_COLS = ['pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace', 'pcthispanic']

# --- Load ------------------------------------------------------------------
df = pd.read_csv('./data/census/census_%s.csv' % YEAR)
# Rows without a surname cannot be used for anything below.
df = df.dropna(subset=['name'])
# '(S)' is a non-numeric placeholder (presumably a suppressed value -- confirm
# against the census file documentation); it is counted as 0 here.
df = df.replace('(S)', 0)
# Any column that contained '(S)' was read as text, so after the replacement it
# mixes strings with the integer 0.  Convert the percentage columns to real
# numbers so later arithmetic does not depend on per-row casting.
df[PCT_COLS] = df[PCT_COLS].apply(pd.to_numeric)
df.head(10)

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1,2376206,880.85,880.85,73.35,22.22,0.40,0.85,1.63,1.56
1,JOHNSON,2,1857160,688.44,1569.30,61.55,33.80,0.42,0.91,1.82,1.50
2,WILLIAMS,3,1534042,568.66,2137.96,48.52,46.72,0.37,0.78,2.01,1.60
3,BROWN,4,1380145,511.62,2649.58,60.71,34.54,0.41,0.83,1.86,1.64
4,JONES,5,1362755,505.17,3154.75,57.69,37.73,0.35,0.94,1.85,1.44
5,MILLER,6,1127803,418.07,3572.82,85.81,10.41,0.42,0.63,1.31,1.43
6,DAVIS,7,1072335,397.51,3970.33,64.73,30.77,0.40,0.79,1.73,1.58
7,GARCIA,8,858289,318.17,4288.50,6.17,0.49,1.43,0.58,0.51,90.81
8,RODRIGUEZ,9,804240,298.13,4586.62,5.52,0.54,0.58,0.24,0.41,92.70
9,WILSON,10,783051,290.27,4876.90,69.72,25.32,0.46,1.03,1.74,1.73


## Resampling weighted by surname count

In [2]:
# Draw 1,000,000 rows with replacement, each surname weighted by its 'count'
# column, so frequent surnames appear proportionally more often.
# random_state is fixed so that Restart & Run All reproduces the same sample.
# (Use a smaller n, e.g. 10000, for a quick trial run.)
sdf = df.sample(1000000, weights=df['count'], replace=True, random_state=21)

## Assign race by percentage

In [3]:
from numpy.random import choice

# Race labels, in the same order as the weights handed to to_race().
races = ['white', 'black', 'api', 'hispanic']

def to_race(c):
    """Draw one race label at random, weighted by the given percentages.

    Parameters
    ----------
    c : sequence of four numbers (numeric strings are accepted)
        Weights for 'white', 'black', 'api' and 'hispanic', in that order.
        They do not have to sum to 100; they are normalised here.

    Returns
    -------
    str
        One element of ``races``.

    Raises
    ------
    ValueError
        If the weights do not sum to a positive number (e.g. every value is
        0), because no probability distribution can be built from them.
    """
    w = np.array(c).astype(float)
    total = w.sum()
    # `not total > 0` also catches NaN, which would otherwise surface as an
    # obscure "probabilities contain NaN" error from numpy.
    if not total > 0:
        raise ValueError('race percentages must sum to a positive number, got %r' % (w.tolist(),))
    return choice(races, p=w / total)

# to_race() draws from numpy's global random generator; seed it so the race
# labels are reproducible on a fresh kernel.
np.random.seed(21)
# One random draw per sampled row, weighted by that surname's percentages.
sdf['race'] = sdf[['pctwhite', 'pctblack', 'pctapi', 'pcthispanic']].apply(to_race, axis=1)
sdf.head(10)

  import sys
  


Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic,race
117541,ADAR,117538,137,0.05,88276.50,75.91,12.41,0,0.00,0,9.49,white
633,MELTON,634,48594,18.01,35257.05,82.29,13.59,0.41,0.94,1.42,1.36,white
155,DIXON,156,152015,56.35,20751.24,57.05,38.21,0.38,0.89,1.84,1.62,black
13,THOMAS,14,710696,263.45,5977.33,55.53,38.17,1.63,1.01,2.00,1.66,white
43,MITCHELL,44,367433,136.21,11661.38,63.55,31.52,0.39,0.98,1.93,1.63,white
114,WEST,115,188464,69.86,18205.59,77.69,17.80,0.39,0.81,1.64,1.66,white
6141,ZAJAC,6141,5135,1.90,62538.15,96.51,0,0.53,0,1.29,1.40,white
159,BLACK,160,150186,55.67,20974.72,76.65,18.56,0.42,1.35,1.49,1.53,white
21,LEE,22,605860,224.59,7925.40,40.09,17.41,37.83,1.03,2.30,1.34,black
13,THOMAS,14,710696,263.45,5977.33,55.53,38.17,1.63,1.01,2.00,1.66,black


## Check the correctness of race assignment

In [4]:
# Census percentages for SMITH, used as the reference for the check below.
df.loc[df['name'] == 'SMITH']

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1,2376206,880.85,880.85,73.35,22.22,0.4,0.85,1.63,1.56


In [5]:
# Share (in percent) of each assigned race among the sampled SMITH rows;
# it should be close to the census percentages shown above.
smith_rows = sdf[sdf['name'] == 'SMITH']
xdf = smith_rows.groupby(['race']).agg({'name': 'count'})
xdf.mul(100).div(xdf.sum())

Unnamed: 0_level_0,name
race,Unnamed: 1_level_1
api,0.452908
black,23.324756
hispanic,1.523417
white,74.698919


In [6]:
# Extra feature: title-cased copy of the surname (e.g. 'SMITH' -> 'Smith').
sdf['name_last'] = sdf['name'].str.title()
# Number of sampled rows per assigned race.
sdf.groupby('race')[['name_last']].count()

Unnamed: 0_level_0,name_last
race,Unnamed: 1_level_1
api,35745
black,126533
hispanic,128100
white,709622


In [7]:
# Number of rows in the resampled frame.
sdf.shape[0]

1000000

## Preprocessing the input data

In [8]:
# The census data only carries surnames, so the combined
# "last name + first name" feature is simply the surname.
sdf['name_last_name_first'] = sdf['name_last']

# Character n-gram vocabulary (case-sensitive).  n-grams that occur in fewer
# than 3 names or in more than 30% of the names are dropped.
vect = CountVectorizer(
    analyzer='char',
    ngram_range=(NGRAMS, NGRAMS),
    lowercase=False,
    min_df=3,
    max_df=0.3,
)
a = vect.fit_transform(sdf['name_last_name_first'])
vocab = vect.vocabulary_
len(vocab)

948

In [9]:
import operator
# (n-gram, column index) pairs ordered by column index, and the n-grams alone
# in that same order -- i.e. the column labels of the count matrix `a`.
sorted_vocab = sorted(vocab.items(), key=lambda pair: pair[1])
cols = [ngram for ngram, _idx in sorted_vocab]

In [10]:
# Dense table of n-gram counts: one row per sampled name, one column per n-gram.
# NOTE: this materialises the whole sparse matrix in memory.
count_df = pd.DataFrame(data=a.toarray(), columns=cols)
count_df

Unnamed: 0,Aa,Ab,Ac,Ad,Ae,Af,Ag,Ah,Ai,Aj,...,zp,zq,zr,zs,zt,zu,zv,zw,zy,zz
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Summary statistics of the total occurrence count of each n-gram.
count_df.sum(axis=0).sort_values(ascending=False).describe()

count       948.000000
mean       5598.553797
std       12748.939305
min           3.000000
25%          86.750000
50%         966.500000
75%        5613.000000
max      175919.000000
dtype: float64

In [12]:
# Limit long listings to 20 rows, then show n-gram totals from most to least frequent.
pd.options.display.max_rows = 20
count_df.sum(axis=0).sort_values(ascending=False)

er    175919
on    123841
an    106240
ar    100302
ll     87063
in     82920
le     73266
en     67266
el     61464
so     57063
       ...  
Fj         3
Mw         3
Dm         3
Hn         3
jp         3
Mb         3
Gf         3
Zl         3
xy         3
Cv         3
Length: 948, dtype: int64

In [13]:
# Rank n-grams by total frequency (highest -> lowest).
# All column totals of the sparse count matrix are computed in a single pass
# instead of slicing out one column at a time.
col_totals = np.asarray(a.sum(axis=0)).ravel()
# Sorting (total, n-gram) tuples in reverse orders by total first; ties are
# broken by the n-gram string, also descending.
words = sorted(((col_totals[idx], ngram) for ngram, idx in vocab.items()), reverse=True)
# Position in words_list is the integer id fed to the model for that n-gram.
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n, vocab=None):
    """Map every character n-gram of ``text`` to its position in a vocabulary.

    Parameters
    ----------
    text : str
        The name to encode.
    n : int
        Length of the character n-grams.
    vocab : sequence of str, optional
        Ordered vocabulary to look the n-grams up in.  Defaults to the
        notebook-level ``words_list``.

    Returns
    -------
    list of int
        One index per consecutive n-gram of ``text`` (empty if ``text`` is
        shorter than ``n``).  n-grams missing from the vocabulary get 0.

    Notes
    -----
    Index 0 is ambiguous: it is both the first vocabulary entry and the
    fallback for unknown n-grams.  The numbering is left unchanged because
    changing it would shift every id derived from ``words_list``.
    """
    if vocab is None:
        vocab = words_list
    indices = []
    # zip over n shifted copies of the text yields the consecutive n-grams.
    for chars in zip(*[text[i:] for i in range(n)]):
        ngram = ''.join(chars)
        try:
            indices.append(vocab.index(ngram))
        except ValueError:
            # Only "not in vocabulary" is expected here; anything else is a
            # real error and must not be silently mapped to 0.
            indices.append(0)
    return indices

# Encode each name as the list of vocabulary indices of its n-grams.
X = np.array(sdf['name_last_name_first'].apply(lambda name: find_ngrams(name, NGRAMS)))

# Longest and (truncated) mean number of n-grams per name.
X_len = [len(seq) for seq in X]
max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))
print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))

# Integer class codes of the race labels (categories in sorted order).
y = np.array(sdf['race'].astype('category').cat.codes)

# 80/20 train/test split, stratified on the class label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

num_words = 948
Max feature len = 14, Avg. feature len = 5


In [14]:
# Number of distinct n-grams kept by the vectorizer.
vocab_size = len(vocab)
vocab_size

948

## Train an LSTM model

ref: http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [15]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

# Model input settings.
max_features = num_words  # vocabulary size
feature_len = 20          # every n-gram sequence is padded / cut to this length
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
# Bring both splits to a fixed (samples, feature_len) shape.
X_train, X_test = (
    sequence.pad_sequences(split, maxlen=feature_len)
    for split in (X_train, X_test)
)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Labels are integer codes starting at 0, so the class count is max + 1.
num_classes = y_train.max() + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
# One-hot encode the integer labels of both splits.
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


800000 train sequences
200000 test sequences
Pad sequences (samples x time)
X_train shape: (800000, 20)
X_test shape: (200000, 20)
4 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (800000, 4)
y_test shape: (200000, 4)


In [16]:
print('Build model...')

# Embedding -> LSTM -> softmax classifier over the n-gram index sequences.
model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 32)            30336     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 516       
Total params: 113,284
Trainable params: 113,284
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
print('Train...')
# 10% of the training split is held out for validation during fitting.
model.fit(
    x=X_train,
    y=y_train,
    epochs=EPOCHS,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=2,
)
# Loss and accuracy on the untouched test split.
score, acc = model.evaluate(x=X_test, y=y_test, batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 720000 samples, validate on 80000 samples
Epoch 1/15
 - 1235s - loss: 0.5701 - acc: 0.8041 - val_loss: 0.5397 - val_acc: 0.8139
Epoch 2/15
 - 1419s - loss: 0.5325 - acc: 0.8160 - val_loss: 0.5197 - val_acc: 0.8198
Epoch 3/15
 - 1428s - loss: 0.5189 - acc: 0.8198 - val_loss: 0.5107 - val_acc: 0.8233
Epoch 4/15
 - 1424s - loss: 0.5113 - acc: 0.8220 - val_loss: 0.5045 - val_acc: 0.8236
Epoch 5/15
 - 1460s - loss: 0.5069 - acc: 0.8230 - val_loss: 0.5015 - val_acc: 0.8252
Epoch 6/15
 - 1710s - loss: 0.5037 - acc: 0.8238 - val_loss: 0.4997 - val_acc: 0.8252
Epoch 7/15
 - 1676s - loss: 0.5017 - acc: 0.8242 - val_loss: 0.4958 - val_acc: 0.8263
Epoch 8/15
 - 1678s - loss: 0.4999 - acc: 0.8248 - val_loss: 0.4964 - val_acc: 0.8269
Epoch 9/15
 - 1679s - loss: 0.4990 - acc: 0.8249 - val_loss: 0.4938 - val_acc: 0.8270
Epoch 10/15
 - 1677s - loss: 0.4980 - acc: 0.8251 - val_loss: 0.4948 - val_acc: 0.8265
Epoch 11/15
 - 1671s - loss: 0.4972 - acc: 0.8256 - val_loss: 0.4925 - val_acc:

## Confusion Matrix

In [18]:
# Run inference once: the softmax outputs are the class probabilities, and the
# predicted class is the most probable column.  (The former predict_classes /
# predict_proba pair ran the network twice and no longer exists in newer Keras.)
p = model.predict(X_test, verbose=0)   # class probabilities, one row per test sample
y_pred = np.argmax(p, axis=1)
# y_test is one-hot encoded; recover the integer class codes.
y_true = np.argmax(y_test, axis=1)
# Category order matches the integer codes used to build y.
target_names = list(sdf.race.astype('category').cat.categories)
print(classification_report(y_true, y_pred, target_names=target_names))
print(confusion_matrix(y_true, y_pred))

             precision    recall  f1-score   support

        api       0.88      0.63      0.74      7149
      black       0.50      0.09      0.15     25307
   hispanic       0.86      0.84      0.85     25620
      white       0.83      0.96      0.89    141924

avg / total       0.79      0.83      0.79    200000

[[  4528     29    702   1890]
 [    86   2289    197  22735]
 [   117     79  21550   3874]
 [   387   2153   2520 136864]]


## Save model

In [19]:
# Create the output directory first so a missing folder cannot throw away a
# finished training run at save time.
model_path = './models/census/lstm/census%s_ln_lstm.h5' % YEAR
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)
model.save(model_path)

In [20]:
# Persist the frequency-ordered n-gram list; row order is the index used to
# encode names for the model.
words_df = pd.DataFrame({'vocab': words_list})
words_df.to_csv(
    './models/census/lstm/census%s_ln_vocab.csv' % YEAR,
    index=False,
    encoding='utf-8',
)