# Data Prep

## Data Loading

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Florida voter file
df = pd.read_csv('../dataverse_files/fl_reg_name_race.csv.gz')
df.dropna(subset=['name_first', 'name_last'], inplace=True)

sdf = df[df.race.isin(['multi_racial', 'native_indian', 'other', 'unknown']) == False]
del df

# Setting consistent case for names
sdf['name_first'] = sdf.name_first.str.title()
sdf['name_last'] = sdf.name_last.str.title()

sdf

Unnamed: 0,name_last,name_first,race
0,Walker,Elizabeth,nh_white
1,Palmer,Alton,nh_white
2,Mc Cleod,Alicia,nh_black
3,Scarborough,Dale,nh_white
4,Walker,Daniel,nh_white
...,...,...,...
13653888,Philpott,April,nh_white
13653889,Walters,William,nh_white
13653890,Sawyer,Matthew,nh_white
13653891,Thomas,Janine,nh_white


In [3]:
# check the different races filtered
sdf.race.value_counts()

nh_white    8714118
hispanic    2174408
nh_black    1847266
asian        253306
Name: race, dtype: int64

In [4]:
# Summing the count of each name & race combination
gdf = sdf.groupby(['name_last','race'], as_index=False)['race'].agg(['count']).reset_index()

In [5]:
# creating a pivot table so that each name has a count of the # of races with that last name
gdf = gdf.pivot_table(values='count', columns='race',index='name_last')

# Converting NaN to zeros since that means there is no one that identifies with that race with that last name
gdf = gdf.fillna(0)

# Getting the totals of each last name
gdf['total_n'] = gdf.sum(axis=1)

gdf['total_norm'] = gdf['total_n']/np.max(gdf['total_n'])
gdf['name_last'] = gdf.index

In [6]:
gdf.head(15)

race,asian,hispanic,nh_black,nh_white,total_n,total_norm,name_last
name_last,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Fleurime Michel,0.0,0.0,1.0,0.0,1.0,1e-05,Fleurime Michel
Franklin,0.0,0.0,1.0,0.0,1.0,1e-05,Franklin
Grant Cliatt,0.0,0.0,1.0,0.0,1.0,1e-05,Grant Cliatt
Hassan,1.0,0.0,0.0,0.0,1.0,1e-05,Hassan
King,0.0,1.0,0.0,0.0,1.0,1e-05,King
Williams,0.0,0.0,0.0,1.0,1.0,1e-05,Williams
0Kharitonenko,0.0,0.0,0.0,1.0,1.0,1e-05,0Kharitonenko
1Amirthanayagam,1.0,0.0,0.0,0.0,1.0,1e-05,1Amirthanayagam
4R,0.0,0.0,0.0,1.0,1.0,1e-05,4R
77348 Dancing Rochanavibhata,1.0,0.0,0.0,0.0,1.0,1e-05,77348 Dancing Rochanavibhata


In [7]:
races = sdf.race.unique().tolist()
races

['nh_white', 'nh_black', 'hispanic', 'asian']

In [8]:
def calc_prop(row):
    total = row['total_n']
    values = [(i/total) for i in row]
    return pd.Series(values)

In [9]:
temp = races
temp.append('total_n')
print(temp)
gdf[races] = gdf[temp].apply(calc_prop, axis=1)

['nh_white', 'nh_black', 'hispanic', 'asian', 'total_n']


In [10]:
gdf #= gdf[races] / gdf['total_n']

race,asian,hispanic,nh_black,nh_white,total_n,total_norm,name_last
name_last,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Fleurime Michel,0.0,0.0,1.0,0.0,1.0,0.000010,Fleurime Michel
Franklin,0.0,0.0,1.0,0.0,1.0,0.000010,Franklin
Grant Cliatt,0.0,0.0,1.0,0.0,1.0,0.000010,Grant Cliatt
Hassan,1.0,0.0,0.0,0.0,1.0,0.000010,Hassan
King,0.0,1.0,0.0,0.0,1.0,0.000010,King
...,...,...,...,...,...,...,...
Zyzanski,0.0,0.0,0.0,1.0,1.0,0.000010,Zyzanski
Zyzdryn,0.0,0.0,0.0,1.0,1.0,0.000019,Zyzdryn
Zyznomyrsky,0.0,0.0,0.0,1.0,1.0,0.000010,Zyznomyrsky
Zzaman,1.0,0.0,0.0,0.0,1.0,0.000010,Zzaman


In [11]:
gdf.drop('total_n', axis=1, inplace=True)
gdf.columns

Index(['asian', 'hispanic', 'nh_black', 'nh_white', 'total_norm', 'name_last'], dtype='object', name='race')

## Data Processing

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing import sequence


NGRAMS = 2
feature_len = 25

In [13]:
proto_df = gdf.sample(frac=0.1, random_state=10)
proto_df.shape

(84983, 6)

In [14]:
# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 

# **********
# **** CHANGE THIS TO FULL DATAFRAME WHEN READY FOR FULL DATASET ****
a = vect.fit_transform(proto_df.name_last)  
# **********

vocab = vect.vocabulary_

In [15]:
# n-gram with freq without sorting 
words = []
for b in vocab:
    c = vocab[b]
    words.append((a[:, c].sum(), b))

words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)

num_words = 1096


In [16]:
def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

In [17]:
# build X from index of n-gram sequence
X = np.array(proto_df.name_last.apply(lambda c: find_ngrams(c, NGRAMS)))
y = np.array(proto_df.iloc[:,:-1])

# Generating an equal length sequence of Xs
X = sequence.pad_sequences(X, maxlen=feature_len)

In [18]:
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.1, random_state=10)

# TF-IDF Transform

In [19]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(a)

In [20]:
def get_cosine_similarity(vectorizer, tfidf_transformer, name ):
    query_tfidf = vectorizer.transform([name])
    cosineSimilarities = cosine_similarity(query_tfidf, tfidf_transformer).flatten()
    return cosineSimilarities

In [21]:
dim = proto_df['name_last'].shape[0]

In [22]:
cos_similarity = np.zeros((dim,dim))

In [23]:
for i in range(dim):
    vect_similarity = get_cosine_similarity(vect, X_train_tfidf, proto_df.iloc[i,-1] )
    if (np.where(vect_similarity <= 0.6) and np.where(vect_similarity >= 0)):
        cos_similarity[i] = vect_similarity
    else:
        print (i , ' vector had a large cosine distance')

In [24]:
cos_similarity[:5]

array([[0.96736333, 0.        , 0.        , ..., 0.11255414, 0.        ,
        0.        ],
       [0.        , 0.96030389, 0.        , ..., 0.        , 0.        ,
        0.10219411],
       [0.        , 0.        , 0.97979594, ..., 0.        , 0.08398708,
        0.11614398],
       [0.        , 0.13815852, 0.09508934, ..., 0.04973842, 0.12278559,
        0.06940931],
       [0.        , 0.18268301, 0.        , ..., 0.        , 0.        ,
        0.        ]])

1. create matrix of bi-chars
2. we can do the tf-idf transform
3. filter
4. using python-Levenshtein (or fuzzywuzzy), calculate levenshtein for the filtered set
5. filter again to levenshtein < 
6. if filtered set > k, then we use all the returned set (so really the mo is to always use the entire returned set) and caculate prop_white, asian, etc., etc.