In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, Dropout, Activation, LSTM, Conv1D, MaxPooling1D

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('census_2010.csv')
df.shape

(162254, 11)

In [3]:
df.head()

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
0,SMITH,1,2442977,828.19,828.19,70.9,23.11,0.5,0.89,2.19,2.4
1,JOHNSON,2,1932812,655.24,1483.42,58.97,34.63,0.54,0.94,2.56,2.36
2,WILLIAMS,3,1625252,550.97,2034.39,45.75,47.68,0.46,0.82,2.81,2.49
3,BROWN,4,1437026,487.16,2521.56,57.95,35.6,0.51,0.87,2.55,2.52
4,JONES,5,1425470,483.24,3004.8,55.19,38.48,0.44,1.0,2.61,2.29


In [4]:
df.tail()

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
162249,DIETZMANN,160975,100,0.03,90062.93,96.0,0,0,(S),0,(S)
162250,DOKAS,160975,100,0.03,90062.96,94.0,(S),0,0,(S),(S)
162251,DONLEA,160975,100,0.03,90062.99,94.0,0,0,0,0,6
162252,DORIOTT,160975,100,0.03,90063.03,89.0,0,(S),0,5,(S)
162253,ALL OTHER NAMES,0,29312001,9936.97,9936.97,66.65,8.53,7.97,0.86,2.32,13.67


In [5]:
# Remove the aggregate "ALL OTHER NAMES" row.
# Filtering on the label (instead of dropping the last row by position) keeps
# this cell idempotent: re-running it never removes a real surname. The
# comparison is also True for the row whose name is missing, so that row is
# kept here and dealt with in the NaN check further down.
df = df.loc[df['name'] != 'ALL OTHER NAMES'].copy()
df.shape

(162253, 11)

In [6]:
df.tail()

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
162248,DOBBEN,160975,100,0.03,90062.89,98,0,(S),0,0,(S)
162249,DIETZMANN,160975,100,0.03,90062.93,96,0,0,(S),0,(S)
162250,DOKAS,160975,100,0.03,90062.96,94,(S),0,0,(S),(S)
162251,DONLEA,160975,100,0.03,90062.99,94,0,0,0,0,6
162252,DORIOTT,160975,100,0.03,90063.03,89,0,(S),0,5,(S)


## Check NaN and drop and sort dataframe

In [7]:
df.isna().sum()

name            1
rank            0
count           0
prop100k        0
cum_prop100k    0
pctwhite        0
pctblack        0
pctapi          0
pctaian         0
pct2prace       0
pcthispanic     0
dtype: int64

In [8]:
df.name[df.name.isna()]

4909    NaN
Name: name, dtype: object

In [9]:
df.iloc[4909]

name                     NaN
rank                    4910
count                   7170
prop100k            2.430000
cum_prop100k    60231.650000
pctwhite               93.58
pctblack                2.09
pctapi                  0.56
pctaian                 0.43
pct2prace               1.35
pcthispanic             1.98
Name: 4909, dtype: object

In [10]:
# Discard the row whose surname is missing (NaN).
df = df.dropna()

In [11]:
df.shape

(162252, 11)

In [12]:
# Preview the rows sorted by name. Fields suppressed for confidentiality are marked (S); they are replaced with 0 in a later cell.

df.sort_values(by = 'name').head()

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
129009,AAB,128249,133,0.05,88770.96,87.97,(S),(S),0,6.77,(S)
45641,AABERG,45569,469,0.16,82003.18,95.1,0,(S),(S),2.56,1.28
85051,AABY,85049,220,0.07,86239.41,98.18,(S),(S),0,0.0,(S)
54997,AADLAND,54990,374,0.13,83329.35,87.97,(S),(S),3.48,5.08,2.41
124575,AAFEDT,124548,138,0.05,88567.34,91.3,(S),(S),0,5.07,(S)


In [13]:
df1 = df.sort_values(by = 'name', ignore_index= True)
df1.tail()

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
162247,ZYSK,35764,628,0.21,80190.96,98.73,0,(S),(S),0,(S)
162248,ZYSKOWSKI,39887,550,0.19,81012.75,96.55,(S),(S),0,1.64,1.27
162249,ZYSMAN,131379,129,0.04,88911.84,94.57,(S),0,0,(S),(S)
162250,ZYWICKI,39419,558,0.19,80925.32,95.52,(S),(S),0,1.25,2.33
162251,ZYWIEC,102688,175,0.06,87442.24,98.29,(S),0,0,0,(S)


In [14]:
# '(S)' marks a percentage suppressed for confidentiality; substitute 0 for it.
df1 = df1.replace('(S)', 0)

In [15]:
df1.tail()

Unnamed: 0,name,rank,count,prop100k,cum_prop100k,pctwhite,pctblack,pctapi,pctaian,pct2prace,pcthispanic
162247,ZYSK,35764,628,0.21,80190.96,98.73,0,0,0,0.0,0.0
162248,ZYSKOWSKI,39887,550,0.19,81012.75,96.55,0,0,0,1.64,1.27
162249,ZYSMAN,131379,129,0.04,88911.84,94.57,0,0,0,0.0,0.0
162250,ZYWICKI,39419,558,0.19,80925.32,95.52,0,0,0,1.25,2.33
162251,ZYWIEC,102688,175,0.06,87442.24,98.29,0,0,0,0.0,0.0


## Preprocess data

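* Each name is split into overlapping character bigrams (n-grams with n = 2), e.g. 'like' → ['li', 'ik', 'ke'].
* Each bigram is then mapped to an integer token: its position when all bigrams are sorted by frequency, most frequent first.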

In [16]:
# Size of the character n-grams extracted from each surname.
NGRAMS = 2

# Character-level n-gram counter.
# `lowercase=False` preserves the capital that `str.title()` puts on the first
# letter of each name, so a name-initial n-gram such as 'Aa' stays distinct
# from the same letters inside a name. `min_df=3` ignores n-grams that occur in
# fewer than three names.
vect = CountVectorizer(
    analyzer='char',
    ngram_range=(NGRAMS, NGRAMS),
    min_df=3,
    lowercase=False,
)


In [17]:
# transform to 'name' feature with capitalize of starting letter

name_vect = vect.fit_transform(df1.name.str.title())

In [18]:
vocab = vect.vocabulary_
len(vocab)

977

In [19]:
print(list(vocab.items())[:10])

#print(vocab)

[('Aa', 0), ('ab', 371), ('be', 400), ('er', 486), ('rg', 771), ('by', 417), ('ad', 373), ('dl', 455), ('la', 636), ('an', 383)]


In [20]:
name_vect

<162252x977 sparse matrix of type '<class 'numpy.int64'>'
	with 959429 stored elements in Compressed Sparse Row format>

In [21]:
# Order the vocabulary by column index so its keys line up with the columns of `name_vect`.
sort_vocab = dict(sorted(vocab.items(), key=lambda pair: pair[1]))

In [22]:
print(list(sort_vocab.items())[:10],'\n')
print(list(sort_vocab.items())[-10:])

[('Aa', 0), ('Ab', 1), ('Ac', 2), ('Ad', 3), ('Ae', 4), ('Af', 5), ('Ag', 6), ('Ah', 7), ('Ai', 8), ('Aj', 9)] 

[('zp', 967), ('zq', 968), ('zr', 969), ('zs', 970), ('zt', 971), ('zu', 972), ('zv', 973), ('zw', 974), ('zy', 975), ('zz', 976)]


In [23]:
count_df = pd.DataFrame(name_vect.todense(), columns= sort_vocab)
count_df.shape

(162252, 977)

In [24]:
# Ex: how many rows with 'er' ...

count_df.sum().sort_values(ascending = False)

er    29110
an    20941
in    15020
ar    14735
en    13545
      ...  
Ij        3
vg        3
Gs        3
bp        3
Yz        3
Length: 977, dtype: int64

In [25]:
# Bigrams from most to least frequent; a bigram's list position is its token id.
two_grams = count_df.sum().sort_values(ascending=False).index.tolist()

In [26]:
two_grams[:10], two_grams[-10:], 

(['er', 'an', 'in', 'ar', 'en', 'el', 'le', 'on', 'll', 'ch'],
 ['Dc', 'tq', 'Yt', 'xv', 'fq', 'Ij', 'vg', 'Gs', 'bp', 'Yz'])

## Char Tokenize

* Replace every bigram of a name with its index in `two_grams`.
* Example (index values are illustrative): ['st', 'tu', 'ud', 'de', 'en', 'nt'] → [46, 357, 272, 31, 7, 55]


In [27]:
def find_ngrams(text):
    """Encode `text` as the indices of its character bigrams in `two_grams`.

    Args:
        text (str): Name to encode, e.g. a title-cased surname.

    Returns:
        list: One integer per overlapping bigram of `text` -- the bigram's
        position in the module-level `two_grams` list, or 0 when the bigram is
        not in that list. Empty when `text` has fewer than two characters.

    Note:
        Index 0 is also the position of the first entry of `two_grams`, so an
        unknown bigram cannot be told apart from that entry.
    """
    indices = []
    for start in range(len(text) - 1):
        bigram = text[start:start + 2]
        try:
            idx = two_grams.index(bigram)
        except ValueError:
            # Only "bigram not in the vocabulary" maps to 0. Any other failure
            # (for example `two_grams` not being a list) is a real error and
            # must surface instead of silently producing zeros.
            idx = 0
        indices.append(idx)
    return indices

## X , Y data

In [28]:
# Names differ in length, so the encoded rows are ragged. dtype=object keeps one
# Python list per name; recent NumPy versions refuse to build a ragged array
# unless the object dtype is requested explicitly.
X = np.array([find_ngrams(name) for name in df1.name.str.title()], dtype=object)

In [29]:
X.shape

(162252,)

In [30]:
X

array([list([710, 133]), list([710, 133, 46, 0, 92]),
       list([710, 133, 378]), ..., list([770, 384, 302, 20, 1]),
       list([770, 685, 201, 24, 31, 51]), list([770, 685, 201, 22, 103])],
      dtype=object)

### Min, Max features

In [31]:
# Number of bigrams in every encoded name, computed once and reused below.
seq_lengths = [len(seq) for seq in X]

max_features = max(seq_lengths)
min_features = min(seq_lengths)
avg_features = np.mean(seq_lengths)

print('max_features: ', max_features)
print('min_features: ', min_features)
print('avg_features: ', avg_features)

max_features:  14
min_features:  1
avg_features:  5.95234573379681


In [32]:
np.argmax([len(i) for i in X]), np.argmin([len(i) for i in X])


(3502, 45)

In [33]:
X[np.argmax([len(i) for i in X])], df1.name.iloc[np.argmax([len(i) for i in X])], X[np.argmin([len(i) for i in X])], df1.name.iloc[np.argmin([len(i) for i in X])]

([308, 44, 83, 336, 59, 53, 34, 47, 198, 278, 68, 89, 43, 53],
 'ANAGNOSTOPOULOS',
 [364],
 'AB')

### Y label

In [34]:
# Pick the percentage columns by name rather than by position, so this cell
# still works when re-run after the `race` column has been appended to df1.
# The column order is the same as in the frame, so ties resolve to the same
# label as with the positional slice.
pct_cols = ['pctwhite', 'pctblack', 'pctapi', 'pctaian', 'pct2prace', 'pcthispanic']
race = df1[pct_cols].astype('float').idxmax(axis=1)

In [35]:
len(race)

162252

In [36]:
race.unique()

array(['pctwhite', 'pctapi', 'pctblack', 'pcthispanic', 'pctaian',
       'pct2prace'], dtype=object)

In [37]:
df1['race'] = race 

In [38]:
df1.race.value_counts()

pctwhite       133214
pcthispanic     11158
pctapi           9241
pctblack         7828
pctaian           688
pct2prace         123
Name: race, dtype: int64

In [39]:
# Human-readable label for each winning percentage column.
race_labels = {
    'pctwhite': 'white',
    'pctapi': 'asian',
    'pctblack': 'black',
    'pcthispanic': 'hispanic',
    'pctaian': 'nativeamerican',
    'pct2prace': 'morethanone',
}

# Assign the result back to the column: an in-place method called on a selected
# column (`df1.race.replace(..., inplace=True)`) is not guaranteed to update
# df1 itself under pandas copy-on-write.
df1['race'] = df1['race'].replace(race_labels)

In [40]:
df1.race.value_counts()

white             133214
hispanic           11158
asian               9241
black               7828
nativeamerican       688
morethanone          123
Name: race, dtype: int64

In [41]:
y = df1.race.astype('category').cat.codes

In [42]:
y.value_counts()

5    133214
2     11158
0      9241
1      7828
4       688
3       123
dtype: int64

 * The category codes 0–5 correspond, in this order, to ['asian', 'black', 'hispanic', 'morethanone', 'nativeamerican', 'white']

In [43]:
# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, np.array(y), test_size=0.001, random_state= 19, stratify= np.array(y))

In [44]:
X_train.shape, X_test.shape,

((162089,), (163,))

In [45]:
y_train[:5]

array([5, 5, 5, 5, 5], dtype=int8)

## LSTM model

In [46]:
X_train = sequence.pad_sequences(X_train, maxlen= max_features, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen= max_features, padding='post')

#X_train = sequence.pad_sequences(X_train, maxlen= max_features)
#X_test = sequence.pad_sequences(X_test, maxlen= max_features)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

X_train shape: (162089, 14)
X_test shape: (163, 14)


In [47]:
X_train[:2], len(np.unique(y))

(array([[ 78,  40, 229, 430,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0],
        [363, 387,  26, 341, 838,   0,   0,   0,   0,   0,   0,   0,   0,
           0]], dtype=int32),
 6)

In [48]:
model = Sequential()
model.add(Embedding(input_dim= len(vocab), output_dim= 32, input_length= max_features))
model.add(LSTM(16, dropout= 0.2, recurrent_dropout= 0.2))
model.add(Dense(len(np.unique(y)), activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss= 'sparse_categorical_crossentropy', optimizer= 'adam', metrics= ['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 14, 32)            31264     
_________________________________________________________________
lstm (LSTM)                  (None, 16)                3136      
_________________________________________________________________
dense (Dense)                (None, 6)                 102       
Total params: 34,502
Trainable params: 34,502
Non-trainable params: 0
_________________________________________________________________


In [49]:
model.fit(X_train, y_train, batch_size= 32, epochs= 20, validation_split= 0.1, verbose= 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7efc93358490>

In [50]:
# `Sequential.predict_classes` is not available in recent TensorFlow releases;
# the arg-max over the softmax output gives the same class prediction.
y_pred = np.argmax(model.predict(X_test), axis=-1)

In [51]:
# `Sequential.predict_proba` is not available in recent TensorFlow releases;
# for this softmax model `predict` returns the per-class probabilities.
y_pred_proba = model.predict(X_test)

In [52]:
len(y_pred_proba), len(y_pred), len(y_test)

(163, 163, 163)

In [53]:
score, acc = model.evaluate(X_train, y_train)
print('Train score:', score)
print('Train accuracy:', acc)

Train score: 0.3809475004673004
Train accuracy: 0.8755868673324585


In [54]:
score, acc = model.evaluate(X_test, y_test)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.4655017554759979
Test accuracy: 0.8466257452964783


In [55]:
model.save('noup_census2010_lstm_2grams_adam.h5')

In [56]:
words_df = pd.DataFrame(two_grams, columns=['vocab'])
words_df.to_csv('noup_census2010_lstm_2grams_vocab_adam.csv', index=False, encoding='utf-8')

# Predictions

In [57]:
def isstring(s):
    """Tell whether `s` is a `str` instance.

    Args:
        s: Any object.

    Returns:
        bool: True if `s` is a string, False otherwise.

    """
    is_text = isinstance(s, str)
    return is_text
    
def column_exists(df, col):
    """Report whether `col` is one of the columns of `df`.

    An empty or otherwise falsy `col` is treated as "nothing to check" and
    yields True. When a non-empty `col` is absent, a message is printed.

    Args:
        df (:obj:`DataFrame`): Pandas DataFrame.
        col (str): Column name.

    Returns:
        bool: False if `col` is given but missing from `df.columns`, True otherwise.

    """
    missing = bool(col) and col not in df.columns
    if missing:
        print("The specify column `{0!s}` not found in the input file".format(col))
    return not missing


def fixup_columns(cols):
    """Give integer column labels a `col` prefix; leave every other label as is.

    Only labels whose type is exactly `int` are renamed (e.g. 3 -> 'col3').

    Args:
        cols (list): List of original columns

    Returns:
        list: List of column names

    """
    return ['col{:d}'.format(label) if type(label) is int else label
            for label in cols]

def find_ngrams(vocabulary, text, n=2):
    """Find and return list of the index of n-grams in the vocabulary list.

    Generate the overlapping character n-grams of `text`, look each one up in
    `vocabulary` and return the list of positions found.

    Note:
        This two-argument definition replaces the one-argument
        `find_ngrams(text)` defined earlier in the notebook once this cell has
        been run.

    Args:
        vocabulary (:obj:`list`): Vocabulary list; an n-gram's position in the
            list is its index.
        text (str): Input text. A value that is not a `str` (e.g. NaN) yields
            an empty list.
        n (int): N-gram length. Defaults to 2.

    Returns:
        list: List of the index of n-grams in the vocabulary list. An n-gram
        that is not in `vocabulary` is encoded as 0.

    Raises:
        ValueError: If `n` is smaller than 1.

    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {0!r}".format(n))

    if not isinstance(text, str):
        return []

    wi = []
    for start in range(len(text) - n + 1):
        gram = text[start:start + n]
        try:
            idx = vocabulary.index(gram)
        except ValueError:
            # Only "n-gram not in the vocabulary" maps to 0. Other failures
            # (e.g. `vocabulary` is not a list) must surface rather than
            # silently encode every n-gram as 0.
            idx = 0
        wi.append(idx)
    return wi




In [58]:
MODEL = os.getcwd() + '/noup_census2010_lstm_2grams_adam.h5'
VOCAB = os.getcwd() + '/noup_census2010_lstm_2grams_vocab_adam.csv'

def pred_census(df, col_name):
    """Predict a race/ethnicity label and class probabilities for each name.

    Loads the saved model (`MODEL`) and the bigram vocabulary (`VOCAB`),
    title-cases the names, encodes them as bigram indices zero-padded to 14
    positions, and runs the model once.

    Args:
        df (:obj:`DataFrame`): Input frame. A `race` column holding the
            predicted label is added to it in place.
        col_name (str): Name of the column that contains the surnames.

    Returns:
        DataFrame: `df` (including `race`) with one probability column per
        class appended, aligned on the index of `df`.

    """
    model = load_model(MODEL)
    # The vocabulary file has a single `vocab` column; a bigram's row position
    # is its index. Build the list once instead of once per name.
    vocabulary = list(pd.read_csv(VOCAB).vocab)
    # Class names in the order of the integer codes the model was trained on.
    race = ['asian', 'black', 'hispanic', 'mtonerace', 'nativeamerican', 'white']

    # Keep the title-cased names in a local Series rather than in a temporary
    # `last_name` column, so a column of that name in the caller's frame is
    # never overwritten or deleted.
    last_names = df[col_name].str.title()

    # The encoded rows are ragged; pad_sequences accepts the plain list of
    # lists, whereas wrapping it in np.array first fails on recent NumPy.
    encoded = [find_ngrams(vocabulary, name) for name in last_names]
    # 14 is the padded length used for the training data (longest encoded name).
    X = sequence.pad_sequences(encoded, maxlen=14, padding='post')

    # One forward pass gives the class probabilities; the arg-max is the
    # predicted class (replaces the removed predict_classes / predict_proba).
    proba = model.predict(X)
    pred = np.argmax(proba, axis=-1)
    df['race'] = [race[int(code)] for code in pred]

    # Reuse df's index so the concat lines up row by row even when df does not
    # carry a default 0..n-1 index.
    pdf = pd.DataFrame(proba, columns=race, index=df.index)

    rdf = pd.concat([df, pdf], axis=1)

    return rdf


In [59]:
names = [{'name': 'smith'},
         {'name': 'zhang'},
         {'name': 'jackson'},
         {'name': 'snyder'},
         {'name': 'chen'},
         {'name': 'darwin'},
         {'name': 'bharat'},
         {'name':'murali'},
         {'name':'robert'},
         {'name':'kashyap'},
         {'name': 'kate'},
         {'name': 'cooper'},
         {'name': 'bush'},
         ]

In [60]:
dff = pd.DataFrame(names)
dff

Unnamed: 0,name
0,smith
1,zhang
2,jackson
3,snyder
4,chen
5,darwin
6,bharat
7,murali
8,robert
9,kashyap


In [61]:
pred_census(dff, 'name')



Unnamed: 0,name,race,asian,black,hispanic,mtonerace,nativeamerican,white
0,smith,white,0.001212,0.061841,0.000474,1.4e-05,0.002952,0.933506
1,zhang,asian,0.973926,0.006641,0.006459,0.004283,0.000891,0.007799
2,jackson,white,0.002684,0.13326,0.000552,3.6e-05,0.002171,0.861297
3,snyder,white,0.000367,0.249996,0.000464,1.1e-05,0.008142,0.74102
4,chen,asian,0.862841,0.00961,0.041741,0.003431,0.005612,0.076765
5,darwin,white,0.000719,0.024673,6.9e-05,1.7e-05,0.001447,0.973075
6,bharat,asian,0.99141,0.001221,0.003498,0.002741,0.000266,0.000863
7,murali,white,0.310038,0.019206,0.028362,0.000748,0.002399,0.639247
8,robert,white,0.001424,0.018971,0.020227,1e-05,0.001323,0.958045
9,kashyap,asian,0.665915,0.014253,0.000674,0.001482,0.007204,0.310472
