### Predict State of Residence of Unseen Last Name Using KNN (Cosine Distance and Levenshtein)

In [1]:
import pandas as pd
import numpy as np

import collections
import Levenshtein as lv
import multiprocessing as mp
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer                                                             
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read only the two columns used downstream from the compressed CSV.
# For a quicker experiment, add nrows=1000000 to the call below.
all_dat = pd.read_csv(
    'instate_processed_clean.csv.gz',
    usecols=['state', 'last_name'],
)
all_dat

Unnamed: 0,state,last_name
0,andaman,datta
1,andaman,devi
2,andaman,krishna
3,andaman,sekhar
4,andaman,toppo
...,...,...
421190803,utt,chouhaan
421190804,utt,chouhaan
421190805,utt,kumaar
421190806,utt,raanee


In [3]:
# (rows, columns) of the loaded frame; one row per (state, last_name) record.
all_dat.shape

(421190808, 2)

In [4]:
# Count the records for every (last_name, state) pair.
# NOTE(review): whether 'last_name' / 'state' come back as index levels or as columns
# depends on how the installed pandas applies as_index=False to this selection;
# the pivot in the next cell needs 'last_name', 'state' and 'count' — confirm after upgrades.
gdf = all_dat.groupby(['last_name','state'], as_index = False)['state'].agg(['count'])

In [5]:
# Reshape to one row per last name and one column per state. A name that never
# occurs in a state has no count there, so the resulting NaN is replaced by 0.
gdf = gdf.pivot_table(index='last_name', columns='state', values='count').fillna(0)

# Row-wise sum over all state columns = total count for that last name.
gdf['total_n'] = gdf.sum(axis=1)
gdf = gdf.reset_index()

In [6]:
%%time
# Calculate the proportion of people with a particular last name that live in various states.
# Every column after last_name is divided by that row's total_n, so total_n itself
# becomes 1.0 (visible in the output below).
gdf.iloc[:, 1:] = gdf.iloc[:, 1:].div(gdf.total_n, axis = 0)
gdf

CPU times: user 234 ms, sys: 0 ns, total: 234 ms
Wall time: 229 ms


state,last_name,andaman,andhra,arunachal,assam,bihar,chandigarh,dadra,daman,delhi,...,odi,puducherry,punjab,rajasthan,sikkim,tel,tripura,up,utt,total_n
0,aaa,0.0,0.000136,0.0,0.000000,0.0,0.0,0.0,0.0,0.017176,...,0.054253,0.0,0.0,0.0,0.001772,0.01813,0.0,0.859733,0.016903,1.0
1,aaaa,0.0,0.000000,0.0,0.003175,0.0,0.0,0.0,0.0,0.000000,...,0.009524,0.0,0.0,0.0,0.000000,0.00000,0.0,0.942857,0.028571,1.0
2,aaaaa,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,1.000000,0.000000,1.0
3,aaaaaa,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.996815,0.000000,1.0
4,aaaabaaraav,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140998,ൟadiaaa,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,1.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0
1140999,ൟaraaia,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,1.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0
1141000,ൟasadeia,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,1.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0
1141001,ൟithi,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,1.000000,0.0,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,1.0


In [7]:
# Modal state = the state column holding the largest proportion for each last name.
# columns[1:-1] skips the first column (last_name) and the last one (total_n).
gdf['modal_state'] = gdf[gdf.columns[1:-1]].idxmax(axis=1)
gdf['modal_state'].head()

0             up
1             up
2             up
3             up
4    maharashtra
Name: modal_state, dtype: object

In [8]:
# Column layout check: last_name, one column per state, total_n, modal_state.
gdf.columns

Index(['last_name', 'andaman', 'andhra', 'arunachal', 'assam', 'bihar',
       'chandigarh', 'dadra', 'daman', 'delhi', 'goa', 'guj', 'har', 'jha',
       'jk', 'kar', 'kerala', 'maharashtra', 'manipur', 'meghalaya', 'mizoram',
       'mp', 'nagaland', 'odi', 'puducherry', 'punjab', 'rajasthan', 'sikkim',
       'tel', 'tripura', 'up', 'utt', 'total_n', 'modal_state'],
      dtype='object', name='state')

In [9]:
# Stratified sample: draw frac=.1 of the names inside every modal_state group
# (use frac=1 to keep every name). The fixed random_state makes the draw repeatable.
proto_df = (
    gdf.groupby('modal_state', group_keys=False)
       .apply(lambda grp: grp.sample(frac=.1, random_state=10))
       .reset_index(drop=True)  # discard the old gdf row labels, renumber from 0
)
proto_df.shape

(114099, 34)

In [10]:
# Number of sampled names per modal state (class balance of the sample).
proto_df.modal_state.value_counts()

maharashtra    34303
andhra         23500
guj            13619
kar             6412
odi             4940
bihar           3828
delhi           3744
kerala          3567
up              3516
mp              3379
manipur         2603
jha             1617
assam           1165
goa             1009
mizoram          937
rajasthan        914
arunachal        745
nagaland         560
har              482
punjab           472
tel              457
utt              386
dadra            374
meghalaya        329
tripura          306
puducherry       259
daman            188
sikkim           180
andaman          115
jk               113
chandigarh        80
Name: modal_state, dtype: int64

In [11]:
# Disabled experiment, kept for reference: per last name, rank the states by share
# and keep the three largest.
# NOTE(review): it reads a frame called `test` that is not defined in the cells above.
#test_agg = test.groupby(['last_name'])['state'].value_counts(normalize=True).unstack(-1).fillna(0).reset_index()
#top_three = test_agg.set_index('last_name')
#top_three = pd.DataFrame(top_three.columns.values[np.argsort(-top_three.values, axis=1)[:, :3]], 
#                  index=top_three.index,
#                 columns = ['1st Max','2nd Max','3rd Max']).reset_index()
#top_three.head()

In [12]:
%%time
# Build the character n-gram vocabulary and the TF-IDF matrix for the sampled names.
# max_df / min_df drop n-grams found in more than 50% or fewer than 0.5% of the names.
NGRAMS = 2
vect = CountVectorizer(analyzer='char', max_df=0.5, min_df=.005, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
tfidf_transformer = TfidfTransformer()

# NOTE(review): both transformers are fitted on all of proto_df, i.e. before the
# train/validation/test split further down, so the vocabulary and IDF weights
# also see the held-out names.
a = vect.fit_transform(proto_df.last_name) 
tfidf = tfidf_transformer.fit_transform(a)

# Mapping n-gram -> column index of `a` / `tfidf`.
vocab = vect.vocabulary_

CPU times: user 988 ms, sys: 0 ns, total: 988 ms
Wall time: 987 ms


In [13]:
# Pair every n-gram of the vocabulary with its total count over all names.
words = [(a[:, col].sum(), ngram) for ngram, col in vocab.items()]

# n-grams only, in vocabulary iteration order (not sorted by column index).
words_list = [ngram for _, ngram in words]
print(words_list[1:10])
num_words = len(words_list)
print("num_words = %d" % num_words)

['ut', 'th', 'hu', 'ul', 'la', 'mi', 'me', 'ee', 'en']
num_words = 197


In [14]:
def find_ngrams(text, n):
    """Map each character n-gram of ``text`` to its position in ``words_list``.

    Parameters
    ----------
    text : str
        String to split into overlapping n-grams.
    n : int
        Length of each n-gram.

    Returns
    -------
    list of int
        One entry per n-gram, in order of appearance. An n-gram that is not in
        the notebook-level ``words_list`` is encoded as 0, which is also the
        position of the first entry of ``words_list``.
    """
    indices = []
    # Zipping n shifted copies of the text yields every overlapping window and
    # stops at the shortest copy, so a text shorter than n produces [].
    for chars in zip(*[text[i:] for i in range(n)]):
        ngram = ''.join(chars)
        try:
            indices.append(words_list.index(ngram))
        except ValueError:
            # Only "n-gram not in the list" is treated as unknown; any other
            # failure (e.g. words_list not defined yet) is allowed to surface.
            indices.append(0)
    return indices

In [15]:
# Store each name's row position in `tfidf` (its rows follow proto_df order) so the
# vector can still be looked up after the frame is split and re-indexed.
proto_df['tfidf_index'] = proto_df.index

In [16]:
# Hold out 5% of the names for testing, then 5% of what remains for validation.
train_df, test_df = train_test_split(proto_df, test_size=.05, random_state=10)
train_df, valid_df = train_test_split(train_df, test_size=.05, random_state=10)

# Renumber every split from 0; without drop=True the previous row label is
# kept as an extra 'index' column.
for split_df in (train_df, valid_df, test_df):
    split_df.reset_index(inplace=True)

print('Training set size: {}'.format(train_df.shape))
print('Validation set size: {}'.format(valid_df.shape))
print('Test set size: {}'.format(test_df.shape))

Training set size: (102974, 36)
Validation set size: (5420, 36)
Test set size: (5705, 36)


In [26]:
def predict_cosine_state(arg):
    """Predict the three most likely states for one name with cosine k-NN.

    Parameters
    ----------
    arg : tuple
        ``(idx, row_data, test_df, corpus_df, corp_vector, k)`` packed into one
        argument so the function can be passed to ``pool.map``:

        * ``idx`` - row label of the name inside ``test_df``
        * ``row_data`` - that row; only ``row_data['tfidf_index']`` is read
        * ``test_df`` - frame that receives the predictions
        * ``corpus_df`` - reference names with one proportion column per state
        * ``corp_vector`` - TF-IDF rows of ``corpus_df``, in the same row order
        * ``k`` - number of nearest neighbours to average over

    Returns
    -------
    tuple
        ``(pred_state1, pred_state2, pred_state3)``, best guess first.

    Notes
    -----
    Reads the notebook-level ``tfidf`` matrix and ``states`` index.
    Side effect: writes the three predictions into ``test_df`` at row ``idx``
    (columns ``pred_state1`` .. ``pred_state3``).
    NOTE(review): the caller runs this from several threads; concurrent writes
    into one DataFrame are presumably not guaranteed safe by pandas - confirm.
    """
    idx, row_data, test_df, corpus_df, corp_vector, k = arg

    # TF-IDF vector of the query name as a (1, m) row, m = number of n-gram features
    orig_vector = tfidf[row_data['tfidf_index']].reshape(1, -1)

    # Cosine similarity between the query name and every corpus name -> shape (1, n)
    cossim = cosine_similarity(orig_vector, corp_vector)

    # Positions of the k most similar corpus names (highest similarity first)
    nearest = np.flip(cossim.flatten().argsort())[:k]
    cossim_df = corpus_df.iloc[nearest]

    # Average the neighbours' state proportions and rank the states. Ranking is
    # done on a plain NumPy array so the integer lookups below are positional
    # and do not depend on integer-key fallback of a label-indexed Series.
    mean_props = cossim_df[states].mean().to_numpy()
    top3_idx = mean_props.argsort()[-3:]  # ascending order: the last entry is the best
    pred_state1 = states[top3_idx[2]]
    pred_state2 = states[top3_idx[1]]
    pred_state3 = states[top3_idx[0]]

    test_df.loc[idx, 'pred_state1'] = pred_state1
    test_df.loc[idx, 'pred_state2'] = pred_state2
    test_df.loc[idx, 'pred_state3'] = pred_state3

    return pred_state1, pred_state2, pred_state3

def check_cosine_k(test_df, corpus_df, k, n_workers=8):
    """Run ``predict_cosine_state`` for every row of ``test_df`` on a thread pool.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Names to predict; needs a ``tfidf_index`` column. It is handed to
        ``predict_cosine_state``, which writes its predictions into it.
    corpus_df : pandas.DataFrame
        Reference names; ``corpus_df['tfidf_index']`` selects their rows from
        the notebook-level ``tfidf`` matrix.
    k : int
        Number of nearest neighbours used for each prediction.
    n_workers : int, optional
        Number of worker threads (default 8, the value previously hard-coded).

    Returns
    -------
    list
        A one-element list; ``result[0]`` is the list of per-row return values
        of ``predict_cosine_state``, in ``test_df`` row order.
    """
    # Imported explicitly: `import multiprocessing as mp` alone does not
    # guarantee that the `mp.pool` submodule attribute has been loaded.
    from multiprocessing.pool import ThreadPool

    # TF-IDF rows of the corpus, computed once and shared by all tasks
    corp_vector = tfidf[corpus_df['tfidf_index']]

    tasks = [(idx, row, test_df, corpus_df, corp_vector, k)
             for idx, row in test_df.iterrows()]

    pool = ThreadPool(processes=n_workers)
    try:
        predictions = pool.map(predict_cosine_state, tasks)
    finally:
        # Release the worker threads even when a prediction raises
        pool.close()
        pool.join()

    # Wrapped in a list because callers read the predictions as result[0]
    return [predictions]

In [18]:
# State column labels only: columns[1:-2] skips the first column (last_name) and
# the last two (total_n, modal_state). Read as a global by predict_cosine_state.
states = gdf.columns[1:-2]
states

Index(['andaman', 'andhra', 'arunachal', 'assam', 'bihar', 'chandigarh',
       'dadra', 'daman', 'delhi', 'goa', 'guj', 'har', 'jha', 'jk', 'kar',
       'kerala', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'mp',
       'nagaland', 'odi', 'puducherry', 'punjab', 'rajasthan', 'sikkim', 'tel',
       'tripura', 'up', 'utt'],
      dtype='object', name='state')

In [19]:
# Candidate neighbour counts for the validation sweep; each value is a
# placeholder (0) until the sweep stores that k's classification report.
k_metrics = dict.fromkeys((3, 5, 10), 0)

In [43]:
# Ground-truth labels of the validation names, in row order.
true_list = valid_df['modal_state'].tolist()

In [44]:
%%time
for value, key in enumerate (k_metrics):
    #print ('{} -- {}'.format(key, value))
    result = check_cosine_k(valid_df, train_df, key)
    
    #pred_list = np.array(result).reshape(-1)
    pred_list = np.array(result[0])[:,0]
    pred_list = pred_list.tolist()
    
    true_list = pd.Series(true_list).tolist() #.replace(replacement).to_list()
    pred_list = pd.Series(pred_list).tolist() #.replace(replacement).to_list()
    
    value = classification_report(true_list, pred_list, zero_division = 0)
    
    print ('for value of k: {} \n{}'.format(key, value))
    k_metrics[key] = value

for value of k: 3 
              precision    recall  f1-score   support

     andaman       0.00      0.00      0.00         4
      andhra       0.66      0.73      0.69      1095
   arunachal       0.19      0.15      0.17        33
       assam       0.27      0.19      0.22        63
       bihar       0.28      0.16      0.21       189
  chandigarh       0.00      0.00      0.00         5
       dadra       0.33      0.10      0.15        21
       daman       0.11      0.09      0.10        11
       delhi       0.29      0.27      0.28       181
         goa       0.12      0.07      0.09        42
         guj       0.73      0.67      0.70       647
         har       0.00      0.00      0.00        22
         jha       0.15      0.08      0.11        71
          jk       0.00      0.00      0.00         7
         kar       0.48      0.46      0.47       292
      kerala       0.24      0.29      0.27       156
 maharashtra       0.61      0.71      0.65      1656
     man

In [46]:
# Sanity check: one prediction per ground-truth label.
len(pred_list), len(true_list)

(5420, 5420)

In [47]:
%%time
# Predict the held-out test names from the training corpus with k = 5 neighbours.
# NOTE(review): k is hard-coded; presumably picked from the validation reports above — confirm.
result = check_cosine_k(test_df, train_df, 5)

CPU times: user 4min, sys: 48.2 s, total: 4min 48s
Wall time: 2min 6s


## TOP1

In [48]:
# Top-ranked prediction per test name: column 0 of the (1st, 2nd, 3rd) tuples in result[0].
pred_list = np.array(result[0])[:, 0].tolist()

# Ground-truth labels of the test names, in row order.
true_list = test_df['modal_state'].tolist()

value = classification_report(true_list, pred_list, zero_division = 0)

print (value)

              precision    recall  f1-score   support

     andaman       0.00      0.00      0.00         4
      andhra       0.67      0.76      0.71      1149
   arunachal       0.22      0.13      0.16        31
       assam       0.21      0.11      0.14        66
       bihar       0.42      0.19      0.26       195
  chandigarh       0.00      0.00      0.00         2
       dadra       0.00      0.00      0.00        12
       daman       0.00      0.00      0.00         9
       delhi       0.36      0.30      0.33       204
         goa       0.45      0.16      0.23        58
         guj       0.72      0.67      0.70       698
         har       0.00      0.00      0.00        21
         jha       0.18      0.07      0.11        80
          jk       0.00      0.00      0.00         4
         kar       0.61      0.41      0.49       333
      kerala       0.29      0.29      0.29       148
 maharashtra       0.56      0.77      0.65      1684
     manipur       0.51    

## TOP2

In [49]:
# Second-ranked prediction per test name: column 1 of the (1st, 2nd, 3rd) tuples in result[0].
# This scores the second guess on its own against the true label; it is not a
# cumulative "correct within the top 2" metric.
pred_list = np.array(result[0])[:, 1].tolist()

# Ground-truth labels of the test names, in row order.
true_list = test_df['modal_state'].tolist()

value = classification_report(true_list, pred_list, zero_division = 0)

print (value)

              precision    recall  f1-score   support

     andaman       0.00      0.00      0.00         4
      andhra       0.18      0.10      0.13      1149
   arunachal       0.03      0.06      0.04        31
       assam       0.14      0.17      0.15        66
       bihar       0.11      0.16      0.13       195
  chandigarh       0.00      0.00      0.00         2
       dadra       0.08      0.25      0.12        12
       daman       0.00      0.00      0.00         9
       delhi       0.12      0.15      0.13       204
         goa       0.07      0.07      0.07        58
         guj       0.18      0.17      0.18       698
         har       0.03      0.05      0.04        21
         jha       0.08      0.10      0.09        80
          jk       0.50      0.25      0.33         4
         kar       0.17      0.20      0.18       333
      kerala       0.10      0.20      0.13       148
 maharashtra       0.20      0.11      0.14      1684
     manipur       0.14    

## TOP3

In [50]:
# Third-ranked prediction per test name: column 2 of the (1st, 2nd, 3rd) tuples in result[0].
# This scores the third guess on its own against the true label; it is not a
# cumulative "correct within the top 3" metric.
pred_list = np.array(result[0])[:, 2].tolist()

# Ground-truth labels of the test names, in row order.
true_list = test_df['modal_state'].tolist()

value = classification_report(true_list, pred_list, zero_division = 0)

print (value)

              precision    recall  f1-score   support

     andaman       0.00      0.00      0.00         4
      andhra       0.12      0.04      0.06      1149
   arunachal       0.04      0.10      0.06        31
       assam       0.05      0.08      0.06        66
       bihar       0.07      0.14      0.10       195
  chandigarh       0.00      0.00      0.00         2
       dadra       0.05      0.17      0.08        12
       daman       0.04      0.11      0.06         9
       delhi       0.08      0.10      0.09       204
         goa       0.07      0.10      0.08        58
         guj       0.09      0.05      0.07       698
         har       0.00      0.00      0.00        21
         jha       0.04      0.09      0.06        80
          jk       0.00      0.00      0.00         4
         kar       0.05      0.10      0.07       333
      kerala       0.09      0.12      0.10       148
 maharashtra       0.14      0.04      0.06      1684
     manipur       0.07    