In [32]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pydot, graphviz
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import random
import tqdm
# from sg_ns_udf import *

In [26]:
# import stopword from nltk

nltk.download('stopwords')
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [141]:
# list of user defined functions

# stopword removal
def remove_stop_word(word_list, stopWords=stopWords):
    wordsFiltered = []
    for w in word_list.split():
        if w not in stopWords:
            wordsFiltered.append(w)
    return(" ".join(wordsFiltered))



# word subsampling to include words in the final document
def subsample_words(words, word_counts):
    sample = 0.001
    new_words = []
    for i in words:
        frac = word_counts[i]/sum(word_counts.values())
        prob = (np.sqrt(frac/sample) + 1) * (sample/frac)
        
        if np.random.random() < prob:
            new_words.append(i)
    return(new_words)



# create target (unigram) & context (unigram) pair with specified window size
def x_y_list(corpus, window_size, vocab):
    target = []
    context = []
    pair = []
    v = []
    for i in range(len(corpus)):
        if (i-window_size)<0:
            start_point = 0
        else:
            start_point = (i-window_size)

        j=start_point
        while j<=(i+window_size):
            if (j!=i) & (j<len(corpus)):
                target += [corpus[i]]
                context += [corpus[j]]
                pair.append([word_to_index[corpus[i]], word_to_index[corpus[j]]])
            j+=1
    return(target, context, pair)
    

# methods of selecting negative sampling with and without probabilistic distribution
# method1
def neg_sample_selector(wrd, ns_list, num_ns, sel_prob, num_times):
    res_sample = []
    for _ in tqdm.tqdm(range(num_times)):
        word, res_tmp = [wrd]*num_ns, np.random.choice(list(ns_list), num_ns, replace=False, p=sel_prob)
        word_ns = np.vstack((word, res_tmp)).T.tolist()
        res_sample += word_ns        
    return(res_sample)


# method2
def neg_sample_selector2(wrd, ns_list, num_ns, sel_prob, num_times):
    res_sample = []
    word, res_tmp = [wrd]*(num_ns*num_times), np.random.choice(list(ns_list), num_ns*num_times, replace=True, p=sel_prob)
    word_ns = np.vstack((word, res_tmp)).T.tolist()
    res_sample += word_ns        
    return(res_sample)


# method3
def neg_sample_selector_wo_prob(wrd, ns_list, num_ns, num_times):
    res_sample = []
    for _ in tqdm.tqdm(range(num_times)):
        word, res_tmp = [wrd]*num_ns, np.random.choice(list(ns_list), num_ns, replace=False)
        word_ns = np.vstack((word, res_tmp)).T.tolist()
        res_sample += word_ns        
    return(res_sample)


# method4
def neg_sample_selector_wo_prob2(wrd, ns_list, num_ns, num_times):
    res_sample = []
    word, res_tmp = [wrd]*(num_ns*num_times), np.random.choice(list(ns_list), num_ns*num_times, replace=True)
    word_ns = np.vstack((word, res_tmp)).T.tolist()
    res_sample += word_ns        
    return(res_sample)



# calculating and finding top-n word similarity
def top_similar_words(target_word, word_to_index, similar_words, top_n):
    if target_word not in word_to_index.keys():
        return(print("target word : <", target_word, "> not present in the vocabulary"))
    else:
        target_word_index = word_to_index[target_word]
        top_similar_idx = (-similar_words[target_word_index,:]).argsort()[1:(top_n+1)]
        index_to_word = {i[1]: i[0] for i in word_to_index.items()}
    return([(index_to_word[ii], similar_words[target_word_index,ii]) for ii in top_similar_idx])



# calculating and finding top-n word dissimilarity
def top_dissimilar_words(target_word, word_to_index, similar_words, top_n):
    if target_word not in word_to_index.keys():
        return(print("target word : <", target_word, "> not present in the vocabulary"))
    else:
        target_word_index = word_to_index[target_word]
        top_similar_idx = (similar_words[target_word_index,:]).argsort()[1:(top_n+1)]
        index_to_word = {i[1]: i[0] for i in word_to_index.items()}
    return([(index_to_word[ii], similar_words[target_word_index,ii]) for ii in top_similar_idx])



# create sentence embedding
def convert_sentence_to_vector(sentence, word_vect):
    sent_vect = np.array([word_vect[sent_token] for sent_token in sentence.split()]).mean(axis=0)
    return(sent_vect)


# find word pair analogy
def find_pair_analogy(positive_word, negative_word, learned_embeddings, word_to_index, top_n):
    
    p1_embd = learned_embeddings[word_to_index[positive_word[0]]]
    p2_embd = learned_embeddings[word_to_index[positive_word[1]]]
    n1_embd = learned_embeddings[word_to_index[negative_word[0]]]

    calc_embd = n1_embd - p1_embd + p2_embd

    learned_embeddings_concat = np.vstack([calc_embd, learned_embeddings])
    similar_words_concat = cosine_similarity(learned_embeddings_concat)
    similar_word = similar_words_concat[0][1:]

    topn_similar_word_idx = np.argsort(similar_word)[::-1][1:(top_n+1)]
    index_to_word = {i[1]: i[0] for i in word_to_index.items()}

    return([(index_to_word[ii], similar_word[ii]) for ii in topn_similar_word_idx])



# finding word compositionality
def find_compositionality(word_list, learned_embeddings, word_to_index, top_n):
    
    calc_embd = np.array([learned_embeddings[word_to_index[i]] for i in word_list]).sum(axis=0)
    
    learned_embeddings_concat = np.vstack([calc_embd, learned_embeddings])
    similar_words_concat = cosine_similarity(learned_embeddings_concat)
    similar_word = similar_words_concat[0][1:]

    topn_similar_word_idx = np.argsort(similar_word)[::-1][1:(top_n+1)]
    index_to_word = {i[1]: i[0] for i in word_to_index.items()}

    return([(index_to_word[ii], similar_word[ii]) for ii in topn_similar_word_idx])

In [8]:
loaded_model = tf.keras.models.load_model("trained_interview_ds_selectmin5_epoch5_batch128_w2v_ns.h5")

In [9]:
with open('final_pos_ns_list_interview_ds_selectmin5.npy', 'rb') as f:
    pos_ns_list = np.load(f)

In [10]:
with open('final_pos_ns_label_interview_ds_selectmin5.npy', 'rb') as f:
    pos_ns_label = np.load(f)

In [11]:
# create train test data due to the large data and to avoid risk of out of memory error

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(pos_ns_list, pos_ns_label, test_size=0.3, random_state=42)

In [12]:
# prediction on test data to validate the accuracy

model_predict = loaded_model.predict([x_test[:, 0], x_test[:, 1]])
model_predict_class = [1 if i>=0.5 else 0 for i in model_predict]



In [13]:
# calculate confusion matrix on test data

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, model_predict_class)
print(cm)

[[8492290   43916]
 [1067839 3200538]]


In [15]:
# calculate test data accuracy, precision, recall and f1 score (based on 0.5 as threshold)

accuracy, precison, recall = (cm[0][0]+cm[1][1])/y_test.shape[0], cm[1][1]/np.sum(cm[:,1]), cm[1][1]/np.sum(cm[1,:])
f1 = (2*precison*recall)/(precison+recall)

In [16]:
print(accuracy, precison, recall, f1)

0.9131752279633003 0.9864642864408002 0.74982551916103 0.8520191656114718


In [17]:
# learned skipgram + ns embedding for every word
learned_embeddings = loaded_model.layers[2].get_weights()[0]

In [18]:
with open('word_to_index_interview_ds_selectmin5.npy', 'rb') as f:
    word_to_index_array = np.load(f)

In [20]:
word_to_index = {i[0]: int(i[1]) for i in word_to_index_array}

In [22]:
# print the learned embeddings

for word, idx in word_to_index.items():
    print(f"{word}: {learned_embeddings[idx]}")

belts: [ -5.6703825   -0.4717555    4.34598     -4.1609316    0.8184342
   5.1119175   -7.7497168   -7.070394    -6.3832307    5.2494206
  -5.115445    -5.0594044    4.916483    -2.7349243   -3.3854957
  -4.1673713   -5.645251    10.156699    -5.34902      5.129325
   3.9195998   -7.5546756   -6.233095     6.066203     3.0762756
  -6.152612     6.824287    -3.9384816    2.5650334    8.862098
   7.248766     5.0079074    6.6303673    7.388708     6.4378443
  -4.7945557   -7.105584     7.1720133    3.006107    -0.519735
   5.5187693   -6.911243     5.1235843    8.361284    -5.5082836
   7.5417275    7.6313524    9.011724     6.076698    -4.55737
   8.522577    -3.8411024    7.513193     0.02006679   4.4576883
  -6.000041     5.5881796   -3.6056447    7.778322    -2.9077163
  -6.094511    -9.363558    -5.394372     7.804233     8.578741
   4.4215965    8.723104    -8.372889     6.0628595   -4.698974
  -7.9081736   -5.850054    11.370805     6.0128207    5.512064
  -7.918045     6.957081  

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



frustration: [ -5.9548697   -6.797942     5.7735543   -5.5977664   -3.1579733
   6.940511    -7.798283    -5.052439    -8.153335     4.263028
  -4.3000374  -10.061233     6.494882    -5.9435434   -2.5760767
  -4.4371314   -3.7323005    8.594753    -5.025368     4.4668574
   5.7604184   -3.6882615   -3.0668988    5.6933284    5.9445524
  -3.7785084    8.183926    -3.2588782    5.048309     2.0767744
   5.430664     6.8304143    7.0376525    4.7335167    6.536848
  -3.0203123   -8.186757     7.9686093    5.1113176    0.74660295
   6.5716543   -5.542118     7.2824655    6.44972     -7.9728026
   4.814668     8.280836     5.3569365    2.7243423   -4.1994343
   9.093799    -7.249716     7.4743085   -1.4514183    8.085657
  -1.7365428    3.1589115   -6.798127     6.8333592   -1.4320652
  -4.7329016   -8.558955    -4.4873514    7.771617     7.346297
   7.9651594    4.4997606   -4.961761     7.3849916   -2.8096542
  -9.654478    -5.0319014    8.7926445    6.614987     3.081375
  -5.684335     

In [34]:
# find similar words by computing cosine similarity between embeddings

from sklearn.metrics.pairwise import cosine_similarity
similar_words = cosine_similarity(learned_embeddings)

In [230]:
# printing top-n similar words for the given target word

top_similar_words(target_word="cricket", word_to_index=word_to_index, similar_words=similar_words, top_n=10)

[('billy', 0.91405237),
 ('lizard', 0.9099143),
 ('leopard', 0.9061944),
 ('tweety', 0.9057031),
 ('insect', 0.9042704),
 ('whiskers', 0.90252763),
 ('bear', 0.9016362),
 ('mittens', 0.9009222),
 ('birdie', 0.9008873),
 ('johnny', 0.89974236)]

In [238]:
top_similar_words(target_word="happy", word_to_index=word_to_index, similar_words=similar_words, top_n=10)

[('hated', 0.76510096),
 ('digging', 0.764146),
 ('counter', 0.7638685),
 ('unhappy', 0.7635076),
 ('kneel', 0.7591511),
 ('wondering', 0.75847447),
 ('slippery', 0.7572594),
 ('struggling', 0.7571649),
 ('pop', 0.75458),
 ('hadnt', 0.752349)]

In [228]:
top_similar_words(target_word="vegetable", word_to_index=word_to_index, similar_words=similar_words, top_n=10)

[('streets', 0.91195935),
 ('lettuce', 0.9106061),
 ('woolys', 0.9083339),
 ('beads', 0.9074483),
 ('cupcake', 0.9071183),
 ('sweeter', 0.90677655),
 ('oat', 0.9067259),
 ('poster', 0.9066619),
 ('chin', 0.9045676),
 ('limeade', 0.90328896)]

In [231]:
top_similar_words(target_word="singing", word_to_index=word_to_index, similar_words=similar_words, top_n=10)

[('whistle', 0.8601067),
 ('princess', 0.85469615),
 ('doctor', 0.84928393),
 ('snake', 0.8464158),
 ('light', 0.83841074),
 ('star', 0.8361299),
 ('building', 0.8345922),
 ('baby', 0.8339813),
 ('purple', 0.8337134),
 ('drum', 0.83364296)]

In [239]:
top_similar_words(target_word="tidy", word_to_index=word_to_index, similar_words=similar_words, top_n=10)

[('organized', 0.8524425),
 ('magnet', 0.84888655),
 ('silly', 0.8443268),
 ('horse', 0.8418251),
 ('cop', 0.8384284),
 ('card', 0.8365329),
 ('dance', 0.8364218),
 ('expensive', 0.8362795),
 ('mouse', 0.8332277),
 ('suit', 0.83147407)]

In [241]:
# top_dissimilar_words(target_word="tidy", word_to_index=word_to_index, similar_words=similar_words, top_n=10)

In [41]:
# word_to_index

In [254]:
# finding word pair analogy like, p1:n1 :: p2:n2 ==> p1-n1 = p2-n2 ==> n2 = n1 - p1 + p2

p1 = "shrimps"
n1 = "fish"

p2 = "apple"

top_n = 10

find_pair_analogy(positive_word=[p1, p2], negative_word=[n1], learned_embeddings=learned_embeddings, word_to_index=word_to_index, top_n=top_n)

[('fish', 0.6191074),
 ('bee', 0.6109871),
 ('gum', 0.610412),
 ('fruit', 0.61028224),
 ('candy', 0.6074096),
 ('chair', 0.60606855),
 ('stars', 0.60151815),
 ('star', 0.6002015),
 ('bone', 0.59966826),
 ('dragon', 0.59946877)]

In [257]:
p1 = "girl"
n1 = "princess"

p2 = "mouse"

top_n = 10

find_pair_analogy(positive_word=[p1, p2], negative_word=[n1], learned_embeddings=learned_embeddings, word_to_index=word_to_index, top_n=top_n)

[('mouse', 0.75377405),
 ('star', 0.7376022),
 ('puppy', 0.735086),
 ('animal', 0.73029566),
 ('bug', 0.728527),
 ('whiskers', 0.72752964),
 ('balloon', 0.7272005),
 ('lizard', 0.7266013),
 ('boat', 0.7265966),
 ('snake', 0.7260296)]

In [251]:
# finding word compositionality

word_list_compo = ["singing","doctor"]

top_n = 10

find_compositionality(word_list=word_list_compo, learned_embeddings=learned_embeddings, word_to_index=word_to_index, top_n=top_n)

[('doctor', 0.95785004),
 ('snake', 0.9167478),
 ('whistle', 0.9075078),
 ('fluffy', 0.90654564),
 ('baby', 0.9050132),
 ('mouse', 0.902864),
 ('frog', 0.9018653),
 ('dream', 0.9010649),
 ('light', 0.90003645),
 ('star', 0.89977384)]

In [252]:
word_list_compo = ["snake","human"]

top_n = 10

find_compositionality(word_list=word_list_compo, learned_embeddings=learned_embeddings, word_to_index=word_to_index, top_n=top_n)

[('snake', 0.95356536),
 ('fluffy', 0.9430057),
 ('mouse', 0.9428352),
 ('bug', 0.9418129),
 ('bella', 0.94093025),
 ('fairy', 0.9371683),
 ('cloud', 0.9365196),
 ('billy', 0.93536),
 ('frog', 0.9330271),
 ('flower', 0.93187666)]

In [187]:
# approximate nearest neighbor based similar sentences

In [188]:
# create sentence embedding
def convert_sentence_to_vector(sentence, word_vect):
    sent_vect = np.array([word_vect[sent_token] for sent_token in sentence.split()]).mean(axis=0)
    return(sent_vect)

In [189]:
with open('sentence_embeddings_interview_ds_selectmin5_epoch5_batch128_w2v_ns.npy', 'rb') as f:
    sentence_embedding = np.load(f)

In [190]:
with open('sent_to_idx_interview_ds_selectmin5.npy', 'rb') as f:
    sent_to_idx = np.load(f)

In [191]:
# find similar words by computing cosine similarity between embeddings

from sklearn.metrics.pairwise import cosine_similarity
# similar_sentence = cosine_similarity(sentence_embedding[:50000])

In [192]:
sentence_embedding[:1000]

array([[-4.9206696, -4.1714416,  4.450413 , ..., -3.0312583, -3.381087 ,
         2.873713 ],
       [-3.3892214, -4.3644347,  4.3068824, ..., -3.6262753, -4.3573194,
         1.6197315],
       [-3.9554696, -4.7291026,  4.010808 , ..., -2.2113929, -2.7102191,
         2.9990237],
       ...,
       [-4.493719 , -4.734232 ,  4.704386 , ..., -1.47854  , -3.4117866,
         3.6016397],
       [-5.468516 , -5.6034317,  3.3719501, ..., -3.6693819, -3.195319 ,
         1.0373479],
       [-3.7609012, -3.1538892,  6.087875 , ..., -2.8975778, -2.612925 ,
         2.7454274]], dtype=float32)

In [205]:
sent_to_idx[:50]

array([['spot playing yard threw brick spot ran get spot brought brick back accidentally dropped tims foot hurt lot started cry',
        '0'],
       ['sara lazy girl like homework help liked watch tv toys diary wrote dreams hid diary bed could see',
        '1'],
       ['owner toy car jack got upset started cry lily felt bad know told pray ask forgiveness lily closed eyes prayed jack forgive',
        '2'],
       ['watched man heard loud noise tree fell cable snapped cable flew air came towards sam lily',
        '3'],
       ['later lily felt tired wanted take nap went room closed eyes woke something strange happened gift gone looked everywhere couldnt find',
        '4'],
       ['spot looked everywhere magazine found big tree picked ran back happy magazine back gave spot big hug became best played together every dayonce comet loved fly fast competitive always wanted best liked race stars see could farthest sky',
        '5'],
       ['came sue trying climb big chair smiled said 

In [194]:
from annoy import AnnoyIndex

In [195]:
# word_to_index

In [196]:
sent_vectors_annoy = {i[0]: sentence_embedding[int(i[1])] for i in sent_to_idx}

In [262]:
num_dimensions = len(next(iter(sent_vectors_annoy.values())))
annoy_index = AnnoyIndex(num_dimensions, 'euclidean')

# Insert word vectors into the Annoy index
for i, (sent_annoy, vector_annoy) in enumerate(sent_vectors_annoy.items()):
    annoy_index.add_item(i, vector_annoy)

annoy_index.build(n_trees=20)

True

In [263]:
sent_to_idx[0][0]

'spot playing yard threw brick spot ran get spot brought brick back accidentally dropped tims foot hurt lot started cry'

In [264]:
# txt = 'spot playing yard threw brick spot ran get spot brought brick back accidentally dropped tims foot hurt lot started cry'

# txt = 'suddenly heard loud noise squirrel running towards squirrel looked nervous scared lily know happening followed squirrel fire forest quickly called firemen helped put fire'
# txt = 'smiled said thats great'
# txt = 'ben youre ok said youre ok'

In [265]:
# similar_indices

In [273]:
txt = "moral story kind others even kind"

# Query for similar vectors
query_vector = sent_vectors_annoy[txt]
similar_indices = annoy_index.get_nns_by_vector(query_vector, n=5)

# Get the words corresponding to the similar indices
similar_sent_annoy = [list(sent_vectors_annoy.keys())[i]+"  " for i in similar_indices][1:]

# Print the similar words
print("sentence similar to <<\033[1m"+txt+"\033[0m>> :\n\n", similar_sent_annoy)

sentence similar to <<[1mmoral story kind others even kind[0m>> :

 ['moral story important kind share others even dont lot rich doesnt mean lot money also means kind heart  ', 'moral story better kind helpful fast  ', 'moral story kind helpful important strong  ', 'frog answered yes talk kind animals  ']


In [272]:
txt = 'suddenly heard loud noise squirrel running towards squirrel looked nervous scared lily know happening followed squirrel fire forest quickly called firemen helped put fire'

# Query for similar vectors
query_vector = sent_vectors_annoy[txt]
similar_indices = annoy_index.get_nns_by_vector(query_vector, n=5)

# Get the words corresponding to the similar indices
similar_sent_annoy = [list(sent_vectors_annoy.keys())[i]+"  " for i in similar_indices][1:]

# Print the similar words
print("sentence similar to <<\033[1m"+txt+"\033[0m>> :\n\n", similar_sent_annoy)

sentence similar to <<[1msuddenly heard loud noise squirrel running towards squirrel looked nervous scared lily know happening followed squirrel fire forest quickly called firemen helped put fire[0m>> :

 ['suddenly heard loud noise scary monster coming towards lily scared know remembered toy bow arrow quickly grabbed aimed monster shot arrow hit monster right eye monster fell move anymore  ', 'mouse helped whiskers settle tried push hole suddenly heard loud noise ground shook big truck crashed tree tree fell top hole whiskers mouse safe inside hole  ', 'suddenly heard loud noise scary lion lion roared chased harry quickly ran hid behind big rock lily scared harry told worry plan harry started sing silly song dance around lion confused stopped chasing  ', 'suddenly airplane started shake pilot scared tried stay calm pointed airplane landed safely ground passengers happy thanked pilot keeping safeonce light brown rabbit rabbit liked hop big green field rabbit big ears small tail  ']


In [275]:
txt = 'ben youre ok said youre ok'

# Query for similar vectors
query_vector = sent_vectors_annoy[txt]
similar_indices = annoy_index.get_nns_by_vector(query_vector, n=5)

# Get the words corresponding to the similar indices
similar_sent_annoy = [list(sent_vectors_annoy.keys())[i]+"  " for i in similar_indices][1:]

# Print the similar words
print("sentence similar to <<\033[1m"+txt+"\033[0m>> :\n\n", similar_sent_annoy)

sentence similar to <<[1mben youre ok said youre ok[0m>> :

 ['ok said ben important hat im glad hes ok  ', 'ok ben said read story first  ', 'ok max said ready set  ', 'ok ben said quiet maybe someone sleeping  ']
