In [117]:
from collections import Counter
import itertools
import string as string

import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import linalg 
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

In [128]:
# Load the corpus as a list of strings, one entry per line of the file.
# NOTE: despite its name, `df` is a plain Python list, not a DataFrame; the
# name is kept because later cells read it.
with open('./input/bible.txt') as f:
    # rstrip('\n') removes the line terminator only when it is present, so a
    # final line without a trailing newline keeps its last character
    # (unconditionally slicing off one character would truncate it).
    df = [line.rstrip('\n') for line in f]
df[0:5]

['In the beginning God created the heaven and the earth.',
 'And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.',
 'And God said, Let there be light: and there was light.',
 'And God saw the light, that it was good: and God divided the light from the darkness.',
 'And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day.']

In [136]:
headlines = df

In [130]:
#df = pd.read_csv('./input/abcnews-date-text.csv')
#df = pd.read_csv('./input/Amazon_Unlocked_Mobile.csv')
#df.head()

In [67]:
#df.headline_text
#df.Reviews

In [137]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [138]:
#headlines = df['headline_text'].tolist()

In [139]:
headlines = [str(headline) for headline in headlines]

In [140]:
table = str.maketrans('', '', string.punctuation)
headlines = [headline.translate(table) for headline in headlines]

In [141]:
headlines = [headline.lower() for headline in headlines]

In [142]:
headlines[0:5]

['in the beginning god created the heaven and the earth',
 'and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters',
 'and god said let there be light and there was light',
 'and god saw the light that it was good and god divided the light from the darkness',
 'and god called the light day and the darkness he called night and the evening and the morning were the first day']

In [143]:
#headlines = df['headline_text'].tolist()
# Split each headline on whitespace and discard English stopwords (NLTK list).
stopwords_set = set(stopwords.words('english'))
headlines = [
    [word for word in text.split() if word not in stopwords_set]
    for text in headlines
]
# Keep only headlines that still contain at least two tokens, so every
# remaining token has at least one neighbour.
headlines = [tokens for tokens in headlines if len(tokens) >= 2]
# Preview the first twenty tokenised headlines.
headlines[0:20]

[['beginning', 'god', 'created', 'heaven', 'earth'],
 ['earth',
  'without',
  'form',
  'void',
  'darkness',
  'upon',
  'face',
  'deep',
  'spirit',
  'god',
  'moved',
  'upon',
  'face',
  'waters'],
 ['god', 'said', 'let', 'light', 'light'],
 ['god', 'saw', 'light', 'good', 'god', 'divided', 'light', 'darkness'],
 ['god',
  'called',
  'light',
  'day',
  'darkness',
  'called',
  'night',
  'evening',
  'morning',
  'first',
  'day'],
 ['god',
  'said',
  'let',
  'firmament',
  'midst',
  'waters',
  'let',
  'divide',
  'waters',
  'waters'],
 ['god',
  'made',
  'firmament',
  'divided',
  'waters',
  'firmament',
  'waters',
  'firmament'],
 ['god',
  'called',
  'firmament',
  'heaven',
  'evening',
  'morning',
  'second',
  'day'],
 ['god',
  'said',
  'let',
  'waters',
  'heaven',
  'gathered',
  'together',
  'unto',
  'one',
  'place',
  'let',
  'dry',
  'land',
  'appear'],
 ['god',
  'called',
  'dry',
  'land',
  'earth',
  'gathering',
  'together',
  'waters',


In [144]:
# Build the vocabulary in a single pass over the tokenised headlines:
#   tok2indx       token -> integer id, assigned in order of first appearance
#   unigram_counts token -> number of occurrences
tok2indx = {}
unigram_counts = Counter()
for ii, headline in enumerate(headlines):
    if ii % 200000 == 0:
        print(f'finished {ii/len(headlines):.2%} of headlines')
    unigram_counts.update(headline)
    for token in headline:
        # setdefault only assigns an id the first time a token is seen.
        tok2indx.setdefault(token, len(tok2indx))
# Inverse mapping: integer id -> token.
indx2tok = dict(zip(tok2indx.values(), tok2indx.keys()))
print('done')
print('vocabulary size: {}'.format(len(unigram_counts)))
print('most common: {}'.format(unigram_counts.most_common(10)))
vocab = len(unigram_counts)

finished 0.00% of headlines
done
vocabulary size: 12603
most common: [('shall', 9838), ('unto', 8997), ('lord', 7830), ('thou', 5474), ('thy', 4600), ('god', 4443), ('said', 3999), ('ye', 3982), ('thee', 3826), ('upon', 2748)]


In [145]:
# NOTE: a dynamic window size could be added here as a hyperparameter.
# Count (focus word, context word) pairs; a context word is any token at most
# `back_window` positions before or `front_window` positions after the focus.
back_window = 2
front_window = 2
skipgram_counts = Counter()
for iheadline, headline in enumerate(headlines):
    n_tokens = len(headline)
    for ifw, fw in enumerate(headline):
        lo = max(0, ifw - back_window)
        hi = min(n_tokens, ifw + front_window + 1)
        # Left-hand contexts first, then right-hand ones; the focus position
        # itself is skipped by construction of the two slices.
        for cw in itertools.chain(headline[lo:ifw], headline[ifw + 1:hi]):
            skipgram_counts[(fw, cw)] += 1
    if iheadline % 200000 == 0:
        print(f'finished {iheadline/len(headlines):.2%} of headlines')

print('done')
print('number of skipgrams: {}'.format(len(skipgram_counts)))
print('most common: {}'.format(skipgram_counts.most_common(10)))

finished 0.00% of headlines
done
number of skipgrams: 540478
most common: [(('said', 'unto'), 1956), (('unto', 'said'), 1956), (('thou', 'shalt'), 1583), (('shalt', 'thou'), 1583), (('lord', 'god'), 1452), (('god', 'lord'), 1452), (('unto', 'lord'), 1391), (('lord', 'unto'), 1391), (('ye', 'shall'), 1034), (('shall', 'ye'), 1034)]


In [146]:
# Assemble the word-context co-occurrence counts as (value, (row, col))
# triplets: the row is the focus word's id, the column the context word's id.
row_indxs = []
col_indxs = []
dat_values = []
ii = 0
for ii, ((tok1, tok2), sg_count) in enumerate(skipgram_counts.items(), start=1):
    if ii % 1000000 == 0:
        print(f'finished {ii/len(skipgram_counts):.2%} of skipgrams')
    tok1_indx = tok2indx[tok1]
    tok2_indx = tok2indx[tok2]

    row_indxs.append(tok1_indx)
    col_indxs.append(tok2_indx)
    dat_values.append(sg_count)

# Give the shape explicitly instead of letting scipy infer it from the largest
# index seen: inference only yields a square vocab x vocab matrix when the
# highest token id happens to occur both as a focus word and as a context word
# (not guaranteed, e.g. with a one-sided window).
n_vocab = len(tok2indx)
wwcnt_mat = sparse.csr_matrix(
    (dat_values, (row_indxs, col_indxs)), shape=(n_vocab, n_vocab)
)
print('done')

done


In [147]:
def ww_sim(word, mat, topn=10):
    """Return the `topn` words whose rows of `mat` are closest to `word`'s row.

    Closeness is cosine similarity between row vectors. `mat` is indexed by
    the ids in the global `tok2indx`; ids are mapped back to tokens through
    the global `indx2tok`.

    Args:
        word: token to look up; raises KeyError if it is not in `tok2indx`.
        mat: 2-D matrix (scipy CSR or dense) with one row per vocabulary id.
        topn: number of (token, score) pairs to return.

    Returns:
        List of (token, cosine similarity) tuples, best match first.
    """
    row = tok2indx[word]
    # Either way the query stays 2-D (1 x n_features), as cosine_similarity expects.
    query = mat.getrow(row) if isinstance(mat, sparse.csr_matrix) else mat[row:row+1, :]
    scores = cosine_similarity(mat, query).flatten()
    # Negate so that argsort's ascending order yields the highest scores first.
    ranked = np.argsort(-scores)[0:topn]
    return [(indx2tok[i], scores[i]) for i in ranked]

In [148]:
ww_sim('strike', wwcnt_mat)
# need to also take out ; when we remove stopwords

[('strike', 1.000000000000001),
 ('deliver', 0.49624930880813395),
 ('clap', 0.48922454899632684),
 ('choose', 0.4817866028565171),
 ('prosper', 0.46348003162962714),
 ('require', 0.4590792902106161),
 ('smite', 0.44914148007366783),
 ('kill', 0.44549583486070876),
 ('work', 0.4439734844358474),
 ('neither', 0.44144848287338806)]

In [149]:
wwcnt_norm_mat = normalize(wwcnt_mat, norm='l2', axis=1)

In [150]:
ww_sim('strike', wwcnt_norm_mat)

[('strike', 0.9999999999999983),
 ('deliver', 0.49624930880813284),
 ('clap', 0.4892245489963266),
 ('choose', 0.4817866028565176),
 ('prosper', 0.4634800316296269),
 ('require', 0.45907929021061594),
 ('smite', 0.449141480073669),
 ('kill', 0.445495834860707),
 ('work', 0.4439734844358497),
 ('neither', 0.4414484828733835)]

In [151]:
# Build four association matrices over the observed (word, context) pairs:
#   pmi   = log2( P(w,c) / (P(w) * P(c)) )
#   ppmi  = max(pmi, 0)
#   spmi  = log2( P(w,c) / (P(w) * P_alpha(c)) ), with a smoothed context
#           probability P_alpha(c) = n(c)**alpha / sum_c' n(c')**alpha
#   sppmi = max(spmi, 0)
# Only pairs present in skipgram_counts are visited, so every other cell of
# the resulting sparse matrices is an implicit zero.
num_skipgrams = wwcnt_mat.sum()
# Sanity check: the matrix total must equal the total number of counted pairs.
assert(sum(skipgram_counts.values())==num_skipgrams)

# for creating sparse matrices
row_indxs = []
col_indxs = []

pmi_dat_values = []
ppmi_dat_values = []
spmi_dat_values = []
sppmi_dat_values = []

# smoothing
alpha = 0.75
# Normaliser for the smoothed context distribution (same quantity as
# sum_over_words_alpha.sum(), computed here directly from the matrix).
nca_denom = np.sum(np.array(wwcnt_mat.sum(axis=0)).flatten()**alpha)
# Column totals: one count per context id (summed over all words).
sum_over_words = np.array(wwcnt_mat.sum(axis=0)).flatten()
sum_over_words_alpha = sum_over_words**alpha
# Row totals: one count per word id (summed over all contexts).
sum_over_contexts = np.array(wwcnt_mat.sum(axis=1)).flatten()

ii = 0
for (tok1, tok2), sg_count in skipgram_counts.items():
    ii += 1
    if ii % 1000000 == 0:
        print(f'finished {ii/len(skipgram_counts):.2%} of skipgrams')
    tok1_indx = tok2indx[tok1]
    tok2_indx = tok2indx[tok2]
    
    # Joint and marginal probabilities estimated from raw counts.
    nwc = sg_count
    Pwc = nwc / num_skipgrams
    nw = sum_over_contexts[tok1_indx]
    Pw = nw / num_skipgrams
    nc = sum_over_words[tok2_indx]
    Pc = nc / num_skipgrams
    
    # Smoothed context probability (count raised to alpha, then renormalised).
    nca = sum_over_words_alpha[tok2_indx]
    Pca = nca / nca_denom
    
    pmi = np.log2(Pwc/(Pw*Pc))
    ppmi = max(pmi, 0)
    
    spmi = np.log2(Pwc/(Pw*Pca))
    sppmi = max(spmi, 0)
    
    # One entry per pair in every list; clipped values are appended as 0
    # rather than skipped, so all four matrices share the same index lists.
    row_indxs.append(tok1_indx)
    col_indxs.append(tok2_indx)
    pmi_dat_values.append(pmi)
    ppmi_dat_values.append(ppmi)
    spmi_dat_values.append(spmi)
    sppmi_dat_values.append(sppmi)
        
pmi_mat = sparse.csr_matrix((pmi_dat_values, (row_indxs, col_indxs)))
ppmi_mat = sparse.csr_matrix((ppmi_dat_values, (row_indxs, col_indxs)))
spmi_mat = sparse.csr_matrix((spmi_dat_values, (row_indxs, col_indxs)))
sppmi_mat = sparse.csr_matrix((sppmi_dat_values, (row_indxs, col_indxs)))

print('done')

done


In [152]:
ww_sim('strike', pmi_mat)

[('strike', 1.0),
 ('debts', 0.232354501049311),
 ('lintel', 0.1870855293554879),
 ('executest', 0.17500283452820603),
 ('dip', 0.15606045136161018),
 ('liver', 0.15059712863024532),
 ('skull', 0.14822644213472094),
 ('scoff', 0.14572735139830556),
 ('basin', 0.13964797782398555),
 ('eared', 0.13774900713268992)]

In [153]:
ww_sim('phone', ppmi_mat)

KeyError: 'phone'

In [None]:
ww_sim('phone', spmi_mat)

In [None]:
ww_sim('phone', sppmi_mat)

In [154]:
# Factorise the chosen association matrix with a truncated SVD that keeps
# `embedding_size` singular triplets: pmi_use ~= uu @ diag(ss) @ vv.
# The singular values in `ss` come back in ascending order (largest last).
embedding_size = 50
pmi_use = ppmi_mat
uu, ss, vv = linalg.svds(pmi_use, k=embedding_size)

In [155]:
print('vocab size: {}'.format(len(unigram_counts)))
print('embedding size: {}'.format(embedding_size))
print('uu.shape: {}'.format(uu.shape))
print('ss.shape: {}'.format(ss.shape))
print('vv.shape: {}'.format(vv.shape))

vocab size: 12603
embedding size: 50
uu.shape: (12603, 50)
ss.shape: (50,)
vv.shape: (50, 12603)


In [156]:
# Unit-length versions of the SVD factors. uu has one row per word, so it is
# normalised along axis=1; vv has one column per word, so along axis=0.
# NOTE(review): a word whose vector is all zeros would divide by zero here.
unorm = uu / np.sqrt(np.sum(uu*uu, axis=1, keepdims=True))
vnorm = vv / np.sqrt(np.sum(vv*vv, axis=0, keepdims=True))
# Alternative embedding choices, kept for experimentation:
#word_vecs = unorm
#word_vecs = vnorm.T
# Embedding actually used: the sum of the left and (transposed) right singular
# vectors. The singular values in `ss` are not applied, and unorm / vnorm /
# word_vecs_norm are not read by any later cell visible in this notebook.
word_vecs = uu + vv.T
word_vecs_norm = word_vecs / np.sqrt(np.sum(word_vecs*word_vecs, axis=1, keepdims=True))

In [157]:
def word_sim_report(word, sim_mat):
    """Print the nearest neighbours of `word` with example headlines.

    For each neighbour returned by `ww_sim(word, sim_mat)` this prints the
    neighbour and its similarity score, followed by up to five tokenised
    headlines (from the global `headlines`) that contain both the neighbour
    and `word`.

    Args:
        word: query token; must be a key of `tok2indx` (KeyError otherwise).
        sim_mat: matrix with one row per vocabulary id, passed to `ww_sim`.
            Previously this argument was ignored and the global `word_vecs`
            was always used.

    Returns:
        None; the report is written to stdout.
    """
    for sim_word, sim_score in ww_sim(word, sim_mat):
        print(sim_word, sim_score)
        # Lazy generator + islice stops scanning as soon as five examples are
        # found instead of filtering the whole corpus for every neighbour.
        shared = (hl for hl in headlines if sim_word in hl and word in hl)
        for headline in itertools.islice(shared, 5):
            print(f'    {headline}')

In [159]:
word = 'smite'
word_sim_report(word, word_vecs)

smite 1.0000000000000002
    ['lord', 'smelled', 'sweet', 'savor', 'lord', 'said', 'heart', 'curse', 'ground', 'mans', 'sake', 'imagination', 'mans', 'heart', 'evil', 'youth', 'neither', 'smite', 'every', 'thing', 'living', 'done']
    ['said', 'esau', 'come', 'one', 'company', 'smite', 'company', 'left', 'shall', 'escape']
    ['deliver', 'pray', 'thee', 'hand', 'brother', 'hand', 'esau', 'fear', 'lest', 'come', 'smite', 'mother', 'children']
    ['stretch', 'hand', 'smite', 'egypt', 'wonders', 'midst', 'thereof', 'let', 'go']
    ['thus', 'saith', 'lord', 'thou', 'shalt', 'know', 'lord', 'behold', 'smite', 'rod', 'mine', 'hand', 'upon', 'waters', 'river', 'shall', 'turned', 'blood']
hindmost 0.7973554165569996
    ['stay', 'ye', 'pursue', 'enemies', 'smite', 'hindmost', 'suffer', 'enter', 'cities', 'lord', 'god', 'hath', 'delivered', 'hand']
sling 0.7117071799931199
slang 0.6954615199123856
enemies 0.6772225861012898
    ['stay', 'ye', 'pursue', 'enemies', 'smite', 'hindmost', 'suffe

In [160]:
word = 'new'
word_sim_report(word, word_vecs)

new 0.9999999999999999
    ['arose', 'new', 'king', 'egypt', 'knew', 'joseph']
    ['even', 'unto', 'morrow', 'seventh', 'sabbath', 'shall', 'ye', 'number', 'fifty', 'days', 'ye', 'shall', 'offer', 'new', 'meat', 'offering', 'unto', 'lord']
    ['ye', 'shall', 'eat', 'old', 'store', 'bring', 'forth', 'old', 'new']
    ['lord', 'make', 'new', 'thing', 'earth', 'open', 'mouth', 'swallow', 'appertain', 'unto', 'go', 'quick', 'pit', 'ye', 'shall', 'understand', 'men', 'provoked', 'lord']
    ['also', 'day', 'firstfruits', 'ye', 'bring', 'new', 'meat', 'offering', 'unto', 'lord', 'weeks', 'ye', 'shall', 'holy', 'convocation', 'ye', 'shall', 'servile', 'work']
agreeth 0.7210477089001187
    ['spake', 'also', 'parable', 'unto', 'man', 'putteth', 'piece', 'new', 'garment', 'upon', 'old', 'otherwise', 'new', 'maketh', 'rent', 'piece', 'taken', 'new', 'agreeth', 'old']
drinkers 0.7139402147718026
    ['awake', 'ye', 'drunkards', 'weep', 'howl', 'ye', 'drinkers', 'wine', 'new', 'wine', 'cut', 'mo

In [163]:
word = 'jesus'
word_sim_report(word, word_vecs)

jesus 1.0
    ['book', 'generation', 'jesus', 'christ', 'son', 'david', 'son', 'abraham']
    ['jacob', 'begat', 'joseph', 'husband', 'mary', 'born', 'jesus', 'called', 'christ']
    ['birth', 'jesus', 'christ', 'wise', 'mother', 'mary', 'espoused', 'joseph', 'came', 'together', 'found', 'child', 'holy', 'ghost']
    ['shall', 'bring', 'forth', 'son', 'thou', 'shalt', 'call', 'name', 'jesus', 'shall', 'save', 'people', 'sins']
    ['knew', 'till', 'brought', 'forth', 'firstborn', 'son', 'called', 'name', 'jesus']
dined 0.9312223866333107
    ['dined', 'jesus', 'saith', 'simon', 'peter', 'simon', 'son', 'jonas', 'lovest', 'thou', 'saith', 'unto', 'yea', 'lord', 'thou', 'knowest', 'love', 'thee', 'saith', 'unto', 'feed', 'lambs']
bishops 0.8788212804443252
    ['paul', 'timotheus', 'servants', 'jesus', 'christ', 'saints', 'christ', 'jesus', 'philippi', 'bishops', 'deacons']
preached 0.814202595573131
    ['jesus', 'answering', 'said', 'unto', 'go', 'way', 'tell', 'john', 'things', 'ye', 

In [165]:
word = 'ass'
word_sim_report(word, word_vecs)

ass 1.0
    ['abraham', 'rose', 'early', 'morning', 'saddled', 'ass', 'took', 'two', 'young', 'men', 'isaac', 'son', 'clave', 'wood', 'burnt', 'offering', 'rose', 'went', 'unto', 'place', 'god', 'told']
    ['abraham', 'said', 'unto', 'young', 'men', 'abide', 'ye', 'ass', 'lad', 'go', 'yonder', 'worship', 'come']
    ['one', 'opened', 'sack', 'give', 'ass', 'provender', 'inn', 'espied', 'money', 'behold', 'sacks', 'mouth']
    ['rent', 'clothes', 'laded', 'every', 'man', 'ass', 'returned', 'city']
    ['issachar', 'strong', 'ass', 'couching', 'two', 'burdens']
ox 0.7423583803655471
    ['thou', 'shalt', 'covet', 'thy', 'neighbors', 'house', 'thou', 'shalt', 'covet', 'thy', 'neighbors', 'wife', 'manservant', 'maidservant', 'ox', 'ass', 'thing', 'thy', 'neighbors']
    ['man', 'shall', 'open', 'pit', 'man', 'shall', 'dig', 'pit', 'cover', 'ox', 'ass', 'fall', 'therein']
    ['theft', 'certainly', 'found', 'hand', 'alive', 'whether', 'ox', 'ass', 'sheep', 'shall', 'restore', 'double']
   

In [166]:
word = 'flesh'
word_sim_report(word, word_vecs)

flesh 1.0
    ['lord', 'god', 'caused', 'deep', 'sleep', 'fall', 'upon', 'adam', 'slept', 'took', 'one', 'ribs', 'closed', 'flesh', 'instead', 'thereof']
    ['adam', 'said', 'bone', 'bones', 'flesh', 'flesh', 'shall', 'called', 'woman', 'taken', 'man']
    ['therefore', 'shall', 'man', 'leave', 'father', 'mother', 'shall', 'cleave', 'unto', 'wife', 'shall', 'one', 'flesh']
    ['lord', 'said', 'spirit', 'shall', 'always', 'strive', 'man', 'also', 'flesh', 'yet', 'days', 'shall', 'hundred', 'twenty', 'years']
    ['god', 'looked', 'upon', 'earth', 'behold', 'corrupt', 'flesh', 'corrupted', 'way', 'upon', 'earth']
flakes 0.9412778021994261
    ['flakes', 'flesh', 'joined', 'together', 'firm', 'cannot', 'moved']
fresher 0.9316456734612926
    ['flesh', 'shall', 'fresher', 'childs', 'shall', 'return', 'days', 'youth']
nourisheth 0.9183523426485174
    ['man', 'ever', 'yet', 'hated', 'flesh', 'nourisheth', 'cherisheth', 'even', 'lord', 'church']
abhorring 0.8924743192012475
    ['shall', '

In [168]:
word = 'love'
word_sim_report(word, word_vecs)

love 1.0
    ['make', 'savory', 'meat', 'love', 'bring', 'may', 'eat', 'soul', 'may', 'bless', 'thee', 'die']
    ['jacob', 'served', 'seven', 'years', 'rachel', 'seemed', 'unto', 'days', 'love']
    ['leah', 'conceived', 'bare', 'son', 'called', 'name', 'reuben', 'said', 'surely', 'lord', 'hath', 'looked', 'upon', 'affliction', 'therefore', 'husband', 'love']
    ['showing', 'mercy', 'unto', 'thousands', 'love', 'keep', 'commandments']
    ['servant', 'shall', 'plainly', 'say', 'love', 'master', 'wife', 'children', 'go', 'free']
marketplaces 0.9622851200592034
    ['said', 'unto', 'doctrine', 'beware', 'scribes', 'love', 'go', 'long', 'clothing', 'love', 'salutations', 'marketplaces']
humbleness 0.7924078096024338
salutations 0.7899960588877815
    ['said', 'unto', 'doctrine', 'beware', 'scribes', 'love', 'go', 'long', 'clothing', 'love', 'salutations', 'marketplaces']
simplicity 0.7783299818745112
    ['long', 'ye', 'simple', 'ones', 'ye', 'love', 'simplicity', 'scorners', 'delight',

In [169]:
def wvec_sim(vec, mat, topn=10):
    """Return the `topn` vocabulary words whose rows of `mat` best match `vec`.

    Args:
        vec: query vector, passed unchanged to `cosine_similarity` as its
            second argument (so it must be 2-D, 1 x n_features).
        mat: matrix with one row per vocabulary id (ids as in `indx2tok`).
        topn: number of (token, score) pairs to return.

    Returns:
        List of (token, cosine similarity) tuples, best match first.
    """
    scores = cosine_similarity(mat, vec).flatten()
    # Negate so that argsort's ascending order yields the highest scores first.
    best = np.argsort(-scores)[0:topn]
    return [(indx2tok[i], scores[i]) for i in best]

In [102]:
#print(vv[0].reshape(-1,1))

In [103]:
#wvec_sim(sparse.csr_matrix((vv[0], (np.zeros(107509), np.arange(107509))), shape=(1, 107509)), ppmi_mat)

In [170]:
# For every SVD component print its index, its singular value, and the five
# vocabulary words whose PPMI rows are most cosine-similar to the component's
# right singular vector.
# Iterating over len(ss) keeps the loop in step with however many components
# were actually computed, instead of a hard-coded 50.
for i in range(len(ss)):
    print(i)
    print(ss[i])
    # vv[i:i + 1] is already a 1 x vocab row; wrapping it directly avoids
    # building (row, col) index arrays by hand (the row indices used to be a
    # float array from np.zeros).
    component = sparse.csr_matrix(vv[i:i + 1])
    print(wvec_sim(component, ppmi_mat)[0:5])

0
112.89398184681296
[('away', 0.13802234723613468), ('transgressions', 0.11904132637184726), ('name', 0.11874176745826918), ('sitnah', 0.11783123371309137), ('allonbachuth', 0.11783123371309137)]
1
113.17387697156882
[('soul', 0.14352007287453905), ('flesh', 0.12391247599057555), ('thirsteth', 0.11610856359729392), ('fowl', 0.11464664980496493), ('kind', 0.1136912344643238)]
2
113.61194960098817
[('wine', 0.10974497042865963), ('barley', 0.10791234588189536), ('jedidiah', 0.10431826362658897), ('benammi', 0.10372773872345109), ('mine', 0.10302347993610811)]
3
113.9808722131935
[('sons', 0.5299288120574069), ('tabeal', 0.18265690456420555), ('rephah', 0.17716877207863496), ('stripling', 0.15685147159857055), ('stools', 0.14182055621240175)]
4
114.6510067606655
[('spices', 0.15157664425618628), ('pure', 0.12963042048379234), ('myrrh', 0.1278366610047832), ('duke', 0.11509151694801835), ('frankincense', 0.11355873292905734)]
5
114.80653045769505
[('pitched', 0.14423780744058012), ('there

In [171]:
# NOTE(review): the component labels below look like notes taken on a
# different corpus (news headlines) — they do not match the words printed for
# this text. Confirm and relabel or remove.
# 46 charged jailed
# 44 interview
# 43 govt
# 41 scientists
#vv[47] + vv[39]
# Words whose PPMI rows are closest to the sum of right singular vectors 41 and 46.
print(wvec_sim(sparse.csr_matrix((vv[41]+vv[46], (np.zeros(vocab), np.arange(vocab))), shape=(1, vocab)), ppmi_mat)[0:10])

[('king', 0.23973073143063667), ('year', 0.22835372167577694), ('shall', 0.20487485495060667), ('day', 0.20353414291768587), ('people', 0.19301579131180807), ('judah', 0.18536744882330478), ('israel', 0.17745001778779065), ('ye', 0.16740570176171218), ('made', 0.15657492957672628), ('land', 0.15657377290061053)]


In [172]:
# Ten closest vocabulary words (by PPMI row) for each of the first four right
# singular vectors; one printed list per component.
for comp in range(4):
    comp_vec = sparse.csr_matrix((vv[comp], (np.zeros(vocab), np.arange(vocab))), shape=(1, vocab))
    print(wvec_sim(comp_vec, ppmi_mat)[0:10])

[('away', 0.13802234723613468), ('transgressions', 0.11904132637184726), ('name', 0.11874176745826918), ('sitnah', 0.11783123371309137), ('allonbachuth', 0.11783123371309137), ('jehovahnissi', 0.11783123371309137), ('blot', 0.11382038161742744), ('sins', 0.11197737155823592), ('jedidiah', 0.11121299958621031), ('barjesus', 0.10915346779424842)]
[('soul', 0.14352007287453905), ('flesh', 0.12391247599057555), ('thirsteth', 0.11610856359729392), ('fowl', 0.11464664980496493), ('kind', 0.1136912344643238), ('redeemeth', 0.11187564194015587), ('thy', 0.11109513876194402), ('one', 0.10697432862240851), ('longeth', 0.10512872166189936), ('winged', 0.101997304572593)]
[('wine', 0.10974497042865963), ('barley', 0.10791234588189536), ('jedidiah', 0.10431826362658897), ('benammi', 0.10372773872345109), ('mine', 0.10302347993610811), ('jehovahnissi', 0.09870311847734207), ('allonbachuth', 0.09870311847734207), ('sitnah', 0.09870311847734207), ('injustice', 0.09792061317942782), ('moabites', 0.0958

In [173]:
def ww_sim_print(word, mat, topn=10):
    """Debug variant of `ww_sim`: print the lookup details for `word`.

    Prints, in order: the word's vocabulary id, the type of its row vector,
    the row vector itself, and the row's shape. The similarity ranking is
    still computed (so any error in that step surfaces) but is not returned.

    Args:
        word: token to look up; raises KeyError if it is not in `tok2indx`.
        mat: 2-D matrix (scipy CSR or dense) with one row per vocabulary id.
        topn: number of neighbours ranked internally.

    Returns:
        Always 0; the ranked list is deliberately discarded.
    """
    indx = tok2indx[word]
    print(indx)
    if isinstance(mat, sparse.csr_matrix):
        v1 = mat.getrow(indx)
    else:
        v1 = mat[indx:indx+1, :]
    print(type(v1))
    print(v1)
    # .shape exists on both sparse and dense rows and equals
    # v1.toarray().shape for sparse ones; .toarray() itself is sparse-only,
    # so the dense branch above used to fail on this line.
    print(v1.shape)
    sims = cosine_similarity(mat, v1).flatten()
    sindxs = np.argsort(-sims)
    sim_word_scores = [(indx2tok[sindx], sims[sindx]) for sindx in sindxs[0:topn]]
    return 0 #sim_word_scores

In [174]:
ss

array([112.89398185, 113.17387697, 113.6119496 , 113.98087221,
       114.65100676, 114.80653046, 115.1082777 , 115.17203029,
       115.69801536, 116.09630266, 116.40536136, 117.45229793,
       117.5397044 , 118.4276611 , 118.46308786, 118.88083513,
       119.32286784, 120.10831836, 120.9739527 , 121.55028216,
       123.28595053, 123.8546328 , 125.48956021, 125.95206597,
       127.62690522, 127.92745191, 129.29892671, 129.97378443,
       131.37674896, 132.72931641, 133.18715151, 136.12894132,
       137.29270871, 137.63469347, 138.78576456, 139.88633946,
       143.37360509, 145.10831668, 146.14430666, 153.6910117 ,
       156.59716783, 156.9329888 , 162.84422122, 172.76284316,
       178.12807455, 191.4335822 , 199.389259  , 222.44549782,
       245.10001976, 495.30686879])

In [175]:
def ww_sim(word, mat, topn=10):
    """Return the `topn` words whose rows of `mat` are closest to `word`'s row.

    Closeness is cosine similarity between row vectors. `mat` is indexed by
    the ids in the global `tok2indx`; ids are mapped back to tokens through
    the global `indx2tok`.

    Note: this cell re-declares a helper of the same name defined earlier in
    the notebook; the behaviour is the same.

    Args:
        word: token to look up; raises KeyError if it is not in `tok2indx`.
        mat: 2-D matrix (scipy CSR or dense) with one row per vocabulary id.
        topn: number of (token, score) pairs to return.

    Returns:
        List of (token, cosine similarity) tuples, best match first.
    """
    row = tok2indx[word]
    # Either way the query stays 2-D (1 x n_features), as cosine_similarity expects.
    query = mat.getrow(row) if isinstance(mat, sparse.csr_matrix) else mat[row:row+1, :]
    scores = cosine_similarity(mat, query).flatten()
    # Negate so that argsort's ascending order yields the highest scores first.
    ranked = np.argsort(-scores)[0:topn]
    return [(indx2tok[i], scores[i]) for i in ranked]

In [176]:
word = 'war'
indx = tok2indx[word]
v1 = ppmi_mat.getrow(indx)
v1

<1x12603 sparse matrix of type '<class 'numpy.float64'>'
	with 352 stored elements in Compressed Sparse Row format>

In [199]:
# word addition
def word_addition(word1, word2, mat, add = 1):
    """Find the words closest to `vec(word1) + add * vec(word2)`.

    Both word vectors are rows of `mat`, and the combined vector is compared
    against every row of the same `mat`, so query and candidates live in one
    vector space. (The rows used to be read from the global `ppmi_mat`
    regardless of which matrix was passed in.)

    Args:
        word1: first token; must be a key of `tok2indx`.
        word2: second token; must be a key of `tok2indx`.
        mat: matrix (scipy CSR or dense 2-D array) with one row per
            vocabulary id.
        add: weight applied to `word2`'s vector; use a negative value to
            subtract it instead.

    Returns:
        The list of (token, cosine similarity) tuples produced by `wvec_sim`.
    """
    indx1 = tok2indx[word1]
    indx2 = tok2indx[word2]
    # Slicing with i:i+1 keeps each row 2-D and works for both CSR and dense
    # matrices, unlike getrow(), which is sparse-only.
    v1 = mat[indx1:indx1 + 1, :]
    v2 = mat[indx2:indx2 + 1, :]
    #indx3 = tok2indx[word3]
    #v3 = mat[indx3:indx3 + 1, :]
    return wvec_sim(v1 + add * v2, mat)

In [206]:
print(word_addition('speaking', 'tongues', ppmi_mat, 1))

[('tongues', 0.7265415350416136), ('speaking', 0.7013944639027624), ('helps', 0.14783131770679636), ('flatter', 0.13623083829470997), ('speak', 0.12886818917904816), ('unadvisedly', 0.1288591325405886), ('boldly', 0.11825321779421354), ('disperse', 0.11640121750482305), ('gnawed', 0.11634868768153211), ('praying', 0.11446022936815468)]
