### Predict State of Residence of Unseen Last Name Using KNN (Cosine Distance and Levenshtein)

In [1]:
import pandas as pd
import numpy as np

import collections
import Levenshtein as lv
import multiprocessing as mp
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the gzipped input CSV; the cells below read its `last_name` and `state` columns.
all_dat = pd.read_csv("../../data/instate_processed_clean.csv.gz")

In [3]:
# (rows, columns) of the loaded frame.
all_dat.shape

(421190808, 3)

In [4]:
# Number of rows for every (last_name, state) pair.
# `size()` counts the rows of each group directly, and `reset_index(name='count')`
# returns flat `last_name`, `state` and `count` columns - the three names the
# pivot that follows refers to - instead of aggregating the grouping key itself.
gdf = all_dat.groupby(['last_name', 'state']).size().reset_index(name='count')

In [5]:
# Reshape to one row per last name and one column per state, each cell holding the
# count for that (last name, state) pair.
# Pairs that never occur come out of the pivot as NaN; they mean "nobody with this
# name in this state", so they are replaced by 0.
gdf = (
    gdf
    .pivot_table(index = 'last_name', columns = 'state', values = 'count')
    .fillna(0)
)

# Row total across all state columns for each last name.
gdf['total_n'] = gdf.sum(axis = 1)

# Move last_name out of the index so it becomes the first ordinary column.
gdf = gdf.reset_index()

In [9]:
%%time
# Calculate the proportion of people with a particular last name that live in various states
gdf.iloc[:, 1:] = gdf.iloc[:, 1:].div(gdf.total_n, axis = 0)
gdf.to_csv("../../data/instate_unique_ln_state_prop_v1.csv.gz",
           compression='gzip')
gdf

CPU times: user 27.4 s, sys: 515 ms, total: 28 s
Wall time: 27.7 s


state,last_name,andaman,andhra,arunachal,assam,bihar,chandigarh,dadra,daman,delhi,...,odi,puducherry,punjab,rajasthan,sikkim,tel,tripura,up,utt,total_n
0,aaa,0.0,0.000136,0.0,0.000000,0.0,0.0,0.0,0.0,0.017176,...,0.054253,0.0,0.0,0.0,0.001772,0.01813,0.0,0.859733,0.016903,1.0
1,aaaa,0.0,0.000000,0.0,0.003175,0.0,0.0,0.0,0.0,0.000000,...,0.009524,0.0,0.0,0.0,0.000000,0.00000,0.0,0.942857,0.028571,1.0
2,aaaaa,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,1.000000,0.000000,1.0
3,aaaaaa,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.996815,0.000000,1.0
4,aaaabaaraav,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140998,ൟadiaaa,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,1.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0
1140999,ൟaraaia,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,1.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0
1141000,ൟasadeia,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,1.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0
1141001,ൟithi,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,1.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0


In [7]:
# Pick the state columns by name rather than by position. A positional slice such as
# columns[1:-1] silently starts to include `total_n` as soon as `modal_state` has been
# appended (e.g. when this cell is re-run), and `total_n` could then be returned as the
# "state" with the largest value.
state_cols = [c for c in gdf.columns if c not in ('last_name', 'total_n', 'modal_state')]

# The modal state of a last name is the state column holding its largest value.
gdf['modal_state'] = gdf[state_cols].idxmax(axis=1)
gdf['modal_state'].head()

0    kerala
1        up
2        up
3        up
4        up
Name: modal_state, dtype: object

In [8]:
# Column layout at this point: last_name, the per-state columns, total_n, modal_state.
gdf.columns

Index(['last_name', 'andaman', 'andhra', 'arunachal', 'assam', 'bihar',
       'chandigarh', 'dadra', 'daman', 'delhi', 'goa', 'guj', 'har', 'jha',
       'jk', 'kar', 'kerala', 'maharashtra', 'manipur', 'meghalaya', 'mizoram',
       'mp', 'nagaland', 'odi', 'puducherry', 'punjab', 'rajasthan', 'sikkim',
       'tel', 'tripura', 'up', 'utt', 'total_n', 'modal_state'],
      dtype='object', name='state')

In [9]:
# Shuffle the rows inside each modal_state group (frac=1 keeps every row; the fixed
# seed makes the order repeatable), then renumber the rows 0..n-1 and discard the
# old row labels.
proto_df = (
    gdf
    .groupby('modal_state', group_keys=False)
    .apply(lambda grp: grp.sample(frac=1, random_state=10))
    .reset_index(drop=True)
)
proto_df.shape

(1128299, 34)

In [10]:
# Number of last names per modal state, i.e. the class balance of the prediction target.
proto_df.modal_state.value_counts()

maharashtra    341179
andhra         229327
guj            136235
kar             62541
odi             49085
bihar           37957
up              36526
delhi           35801
kerala          34938
mp              33698
manipur         25400
jha             15991
assam           11566
goa              9783
mizoram          9296
rajasthan        9106
arunachal        7000
tel              5944
nagaland         5447
har              4703
punjab           4672
utt              3835
dadra            3312
meghalaya        3189
tripura          3075
puducherry       2421
sikkim           1755
daman            1753
jk               1094
andaman           896
chandigarh        774
Name: modal_state, dtype: int64

In [11]:
#test_agg = test.groupby(['last_name'])['state'].value_counts(normalize=True).unstack(-1).fillna(0).reset_index()
#top_three = test_agg.set_index('last_name')
#top_three = pd.DataFrame(top_three.columns.values[np.argsort(-top_three.values, axis=1)[:, :3]], 
#                  index=top_three.index,
#                 columns = ['1st Max','2nd Max','3rd Max']).reset_index()
#top_three.head()

In [12]:
%%time
# build n-gram list
NGRAMS = 2
vect = CountVectorizer(analyzer='char', max_df=0.5, min_df=.005, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

a = vect.fit_transform(proto_df.last_name) 
tfidf = tfidf_transformer.fit_transform(a)

vocab = vect.vocabulary_

CPU times: user 6.04 s, sys: 68.2 ms, total: 6.11 s
Wall time: 6.15 s


In [13]:
# Total occurrences of every vocabulary n-gram, obtained with one column-sum over the
# whole count matrix instead of slicing and summing one sparse column per n-gram.
ngram_totals = np.asarray(a.sum(axis=0)).ravel()

# (total count, n-gram) pairs in the iteration order of `vocab`, and the bare n-grams
# in that same order.
words = [(ngram_totals[col], gram) for gram, col in vocab.items()]
words_list = [gram for _, gram in words]

# NOTE: the preview slice starts at position 1, so the first n-gram is not printed.
print(words_list[1:10])
num_words = len(words_list)
print("num_words = %d" % num_words)

['ha', 'at', 'va', 'bi', 'in', 'nd', 'du', 'uj', 'ja']
num_words = 195


In [14]:
def find_ngrams(text, n, vocabulary=None):
    """Map every character n-gram of ``text`` to its position in a vocabulary list.

    Parameters
    ----------
    text : str
        The string to split into overlapping n-grams.
    n : int
        Length of each n-gram.
    vocabulary : sequence of str, optional
        Known n-grams. When omitted, the notebook-level ``words_list`` is used,
        which is what the original two-argument call did.

    Returns
    -------
    list of int
        One entry per n-gram of ``text``, left to right: the position of the first
        occurrence of that n-gram in the vocabulary, or 0 when it is not present.
        Note that 0 is therefore shared by unknown n-grams and the first vocabulary
        entry. Texts shorter than ``n`` give an empty list.

    Raises
    ------
    NameError
        If ``vocabulary`` is omitted and ``words_list`` has not been defined yet.
        (The previous bare ``except`` swallowed this and returned all zeros.)
    """
    known = words_list if vocabulary is None else vocabulary

    # One lookup table per call replaces a linear list.index scan per n-gram;
    # setdefault keeps the first position when an n-gram is listed more than once,
    # matching list.index.
    position = {}
    for pos, gram in enumerate(known):
        position.setdefault(gram, pos)

    # Zipping the text against its own shifted copies yields the sliding windows.
    windows = zip(*(text[i:] for i in range(n)))
    return [position.get(''.join(chars), 0) for chars in windows]

In [15]:
# Store each row's current index label in a column; the prediction code later uses this
# value to pick the row of `tfidf` that belongs to a name.
proto_df['tfidf_index'] = proto_df.index

In [16]:
# Hold out 5% of all names as the test set, then 5% of what remains as validation.
# The same seed is used for both splits so the partition is repeatable.
train_df, test_df = train_test_split(proto_df, random_state=10, test_size=.05)
train_df, valid_df = train_test_split(train_df, random_state=10, test_size=.05)

# Renumber each subset 0..n-1; the previous row labels are kept in a new `index` column.
train_df = train_df.reset_index()
valid_df = valid_df.reset_index()
test_df = test_df.reset_index()

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (1018289, 36)
Validation set size: (53595, 36)
Test set size: (56415, 36)


In [17]:
def predict_cosine_state(arg):
    """Predict the state of one name from its k most cosine-similar corpus names.

    Relies on the notebook-level ``tfidf`` matrix and ``states`` column index.

    Parameters
    ----------
    arg : tuple
        ``(idx, row_data, test_df, corpus_df, corp_vector, k)``, packed into a single
        argument so the function can be used with ``pool.map``:

        * ``row_data`` - row of the name to classify; its ``tfidf_index`` selects the
          matching row of ``tfidf``.
        * ``corpus_df`` - reference names, row-aligned with ``corp_vector``.
        * ``corp_vector`` - tf-idf rows of the corpus names.
        * ``k`` - number of nearest neighbours to average over.
        * ``idx`` and ``test_df`` are still accepted so existing callers keep working,
          but are no longer used: writing into a shared DataFrame from several worker
          threads is not safe, so ``check_cosine_k`` now stores the predictions once
          all workers have finished.

    Returns
    -------
    str
        The state whose column has the highest mean over the k nearest corpus rows.
    """
    _idx, row_data, _test_df, corpus_df, corp_vector, k = arg

    # tf-idf row of the query name as a (1, m) matrix, m = number of n-gram features.
    query_vector = tfidf[row_data['tfidf_index']].reshape(1, -1)

    # Cosine similarity of the query against every corpus row -> shape (1, n).
    cossim = cosine_similarity(query_vector, corp_vector)

    # argsort is ascending, so flip it and keep the first k positions to get the
    # k most similar corpus rows.
    nearest = corpus_df.iloc[np.flip(cossim.flatten().argsort())[:k]]

    # Average the neighbours' per-state values and return the top state.
    return states[nearest[states].mean().argmax()]


def check_cosine_k(test_df, corpus_df, k, n_workers=8):
    """Predict a state for every row of ``test_df`` with cosine k-nearest neighbours.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Names to classify; must have a ``tfidf_index`` column. A ``pred_state``
        column holding the predictions is added to it (or overwritten).
    corpus_df : pandas.DataFrame
        Reference names; must have ``tfidf_index`` and the per-state columns.
    k : int
        Number of nearest neighbours used for each prediction.
    n_workers : int, optional
        Size of the thread pool. Defaults to 8, the value that was hard-coded before.

    Returns
    -------
    list
        A one-element list wrapping the list of predicted states, in the row order
        of ``test_df`` (same shape of result as before).
    """
    # Imported explicitly: `multiprocessing.pool` is a submodule and is only reachable
    # as `mp.pool` if something else has already imported it.
    from multiprocessing.pool import ThreadPool

    # tf-idf rows of the corpus names, selected once and shared by all tasks.
    corp_vector = tfidf[corpus_df['tfidf_index']]

    tasks = [(idx, row, test_df, corpus_df, corp_vector, k)
             for idx, row in test_df.iterrows()]

    # The context manager releases the pool even if a worker raises.
    with ThreadPool(processes=n_workers) as pool:
        predictions = pool.map(predict_cosine_state, tasks)

    # pool.map preserves task order, so the predictions line up with test_df's rows.
    # Written once here, from the calling thread, instead of from each worker.
    test_df['pred_state'] = predictions

    return [predictions]

In [18]:
# Per-state column labels, selected by name: everything except the name column and the
# two derived columns. A positional slice would silently change meaning if the trailing
# columns of gdf were added in a different order or not at all.
states = gdf.columns.drop(['last_name', 'total_n', 'modal_state'], errors='ignore')
states

Index(['andaman', 'andhra', 'arunachal', 'assam', 'bihar', 'chandigarh',
       'dadra', 'daman', 'delhi', 'goa', 'guj', 'har', 'jha', 'jk', 'kar',
       'kerala', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'mp',
       'nagaland', 'odi', 'puducherry', 'punjab', 'rajasthan', 'sikkim', 'tel',
       'tripura', 'up', 'utt'],
      dtype='object', name='state')

In [19]:
# One entry per neighbour count k to evaluate; 0 is a placeholder that the evaluation
# loop replaces with the classification report for that k.
k_metrics = dict.fromkeys((3, 5, 10), 0)

In [20]:
# Ground-truth modal state of every validation name, in row order. Reading the column
# directly gives the same list as looping over iterrows(), without building a Series
# for each row.
true_list = valid_df['modal_state'].tolist()

In [21]:
%%time
for value, key in enumerate (k_metrics):
    #print ('{} -- {}'.format(key, value))
    result = check_cosine_k(valid_df, train_df, key)
    
    pred_list = np.array(result).reshape(-1)
    pred_list = pred_list.tolist()
    
    true_list = pd.Series(true_list).tolist() #.replace(replacement).to_list()
    pred_list = pd.Series(pred_list).tolist() #.replace(replacement).to_list()
    
    value = classification_report(true_list, pred_list, zero_division = 0)
    
    print ('for value of k: {} \n{}'.format(key, value))
    k_metrics[key] = value

for value of k: 3 
              precision    recall  f1-score   support

     andaman       0.04      0.02      0.03        47
      andhra       0.72      0.83      0.77     10917
   arunachal       0.28      0.24      0.26       339
       assam       0.33      0.29      0.31       572
       bihar       0.41      0.30      0.34      1781
  chandigarh       0.00      0.00      0.00        41
       dadra       0.41      0.21      0.27       150
       daman       0.16      0.09      0.12        85
       delhi       0.41      0.44      0.42      1695
         goa       0.40      0.29      0.33       470
         guj       0.80      0.75      0.77      6479
         har       0.18      0.06      0.09       233
         jha       0.37      0.28      0.32       784
          jk       0.26      0.22      0.23        51
         kar       0.66      0.60      0.63      2974
      kerala       0.45      0.41      0.43      1653
 maharashtra       0.69      0.77      0.73     16311
     man

In [22]:
%%time
# Predict a state for every test-set name against the training corpus with k = 5
# neighbours. This is the long-running step of the notebook (see the timing below).
result = check_cosine_k(test_df, train_df, 5)

CPU times: user 4h 10min 21s, sys: 11min 29s, total: 4h 21min 51s
Wall time: 1h 43min 34s


In [23]:
# check_cosine_k returns a list wrapping one list of predictions; flatten it.
pred_list = [state for batch in result for state in batch]

# Ground-truth modal state of every test name, in row order. Reading the column
# directly replaces the row-by-row iterrows() loop and the Series round trips.
true_list = test_df['modal_state'].tolist()

# zero_division=0 reports 0 instead of warning for classes that are never predicted.
value = classification_report(true_list, pred_list, zero_division = 0)

print (value)

              precision    recall  f1-score   support

     andaman       0.09      0.02      0.03        50
      andhra       0.73      0.83      0.78     11249
   arunachal       0.34      0.21      0.26       348
       assam       0.46      0.31      0.37       553
       bihar       0.44      0.28      0.34      1880
  chandigarh       0.00      0.00      0.00        47
       dadra       0.60      0.21      0.31       145
       daman       0.38      0.10      0.15        93
       delhi       0.45      0.48      0.46      1799
         goa       0.55      0.22      0.31       541
         guj       0.81      0.74      0.78      6967
         har       0.21      0.05      0.08       248
         jha       0.38      0.26      0.31       766
          jk       0.18      0.07      0.10        46
         kar       0.69      0.59      0.64      3150
      kerala       0.48      0.42      0.45      1712
 maharashtra       0.67      0.82      0.74     17104
     manipur       0.65    