# Embeddings

In [1]:
import os
import urllib.request
import zipfile

import numpy as np
import tensorflow as tf

In [2]:
# Remote location of the zipped GloVe vectors and the local file name to save it under.
url = 'https://firebasestorage.googleapis.com/v0/b/z2tma61d2a74hya815w9x621uszb3a.appspot.com/o/glove.6B.50d.txt.zip?alt=media&token=eef5e290-ec33-4e50-9c7d-d5d4dbcaa771'
local_zip = 'glove.6B.50d.txt.zip'

# Skip the download when the archive is already on disk so that
# "Restart & Run All" does not fetch it again on every run.
if not os.path.exists(local_zip):
    urllib.request.urlretrieve(url, local_zip)

# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall()

In [3]:
# Parse the extracted text file: the first whitespace-separated field of each
# line is the word, the remaining fields are its float32 vector components.
word_to_vec = {}
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which can fail or mis-decode non-ASCII tokens on some systems.
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip().split()
        curr_word = line[0]
        word_to_vec[curr_word] = np.array(line[1:], dtype='float32')

# Dicts preserve insertion order, so a word's index is its position in the file.
word_to_idx = {w: i for i, w in enumerate(word_to_vec)}
idx_to_word = dict(enumerate(word_to_vec))

In [4]:
# Display the word -> index lookup table.
word_to_idx

{'the': 0,
 ',': 1,
 '.': 2,
 'of': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'a': 7,
 '"': 8,
 "'s": 9,
 'for': 10,
 '-': 11,
 'that': 12,
 'on': 13,
 'is': 14,
 'was': 15,
 'said': 16,
 'with': 17,
 'he': 18,
 'as': 19,
 'it': 20,
 'by': 21,
 'at': 22,
 '(': 23,
 ')': 24,
 'from': 25,
 'his': 26,
 "''": 27,
 '``': 28,
 'an': 29,
 'be': 30,
 'has': 31,
 'are': 32,
 'have': 33,
 'but': 34,
 'were': 35,
 'not': 36,
 'this': 37,
 'who': 38,
 'they': 39,
 'had': 40,
 'i': 41,
 'which': 42,
 'will': 43,
 'their': 44,
 ':': 45,
 'or': 46,
 'its': 47,
 'one': 48,
 'after': 49,
 'new': 50,
 'been': 51,
 'also': 52,
 'we': 53,
 'would': 54,
 'two': 55,
 'more': 56,
 "'": 57,
 'first': 58,
 'about': 59,
 'up': 60,
 'when': 61,
 'year': 62,
 'there': 63,
 'all': 64,
 '--': 65,
 'out': 66,
 'she': 67,
 'other': 68,
 'people': 69,
 "n't": 70,
 'her': 71,
 'percent': 72,
 'than': 73,
 'over': 74,
 'into': 75,
 'last': 76,
 'some': 77,
 'government': 78,
 'time': 79,
 '$': 80,
 'you': 81,
 'years': 82,
 'i

In [5]:
# Display the index -> word lookup table.
idx_to_word

{0: 'the',
 1: ',',
 2: '.',
 3: 'of',
 4: 'to',
 5: 'and',
 6: 'in',
 7: 'a',
 8: '"',
 9: "'s",
 10: 'for',
 11: '-',
 12: 'that',
 13: 'on',
 14: 'is',
 15: 'was',
 16: 'said',
 17: 'with',
 18: 'he',
 19: 'as',
 20: 'it',
 21: 'by',
 22: 'at',
 23: '(',
 24: ')',
 25: 'from',
 26: 'his',
 27: "''",
 28: '``',
 29: 'an',
 30: 'be',
 31: 'has',
 32: 'are',
 33: 'have',
 34: 'but',
 35: 'were',
 36: 'not',
 37: 'this',
 38: 'who',
 39: 'they',
 40: 'had',
 41: 'i',
 42: 'which',
 43: 'will',
 44: 'their',
 45: ':',
 46: 'or',
 47: 'its',
 48: 'one',
 49: 'after',
 50: 'new',
 51: 'been',
 52: 'also',
 53: 'we',
 54: 'would',
 55: 'two',
 56: 'more',
 57: "'",
 58: 'first',
 59: 'about',
 60: 'up',
 61: 'when',
 62: 'year',
 63: 'there',
 64: 'all',
 65: '--',
 66: 'out',
 67: 'she',
 68: 'other',
 69: 'people',
 70: "n't",
 71: 'her',
 72: 'percent',
 73: 'than',
 74: 'over',
 75: 'into',
 76: 'last',
 77: 'some',
 78: 'government',
 79: 'time',
 80: '$',
 81: 'you',
 82: 'years',
 83

In [6]:
# Collect every word vector, in dictionary order, into a single array
# (one row per word), then show its dimensions.
embedding_matrix = np.array([vec for vec in word_to_vec.values()])
embedding_matrix.shape

(400000, 50)

In [7]:
def CosineSimilarityModel(params):
    """Build a Keras model that returns the cosine similarity of two words.

    Args:
        params: dict with key 'embedding_matrix', a 2-D array whose row i is
            the embedding vector of word index i. Its shape determines the
            embedding layer's input_dim and output_dim, and its values are
            loaded as the layer's weights.

    Returns:
        A tf.keras.Model taking two inputs of shape (batch, 1) holding word
        indices and producing an output of shape (batch, 1) with the cosine
        similarity of each corresponding pair of embeddings.
    """
    embedding_matrix = params['embedding_matrix']
    vocab_size, embedding_dim = embedding_matrix.shape

    # One embedding layer shared by both inputs so they use the same table.
    embed = tf.keras.layers.Embedding(input_dim=vocab_size,
                                      output_dim=embedding_dim,
                                      name='embedding')
    flatten = tf.keras.layers.Flatten()
    dot = tf.keras.layers.Dot(axes=1)
    # keepdims=True keeps each norm at shape (batch, 1) so the division is
    # element-wise against the (batch, 1) dot product. Without it the norms
    # have shape (batch,) and broadcasting yields a (batch, batch) result
    # whenever more than one pair is passed.
    normalize = tf.keras.layers.Lambda(
        lambda t: t[0] / (tf.norm(t[1], axis=1, keepdims=True)
                          * tf.norm(t[2], axis=1, keepdims=True)))

    # shape must be a tuple: `(1)` is just the int 1, `(1,)` is a 1-tuple.
    x0 = tf.keras.Input(shape=(1,))
    z0 = tf.keras.Input(shape=(1,))

    # (batch, 1) indices -> (batch, 1, dim) embeddings -> (batch, dim) vectors.
    x2 = flatten(embed(x0))
    z2 = flatten(embed(z0))

    y1 = dot([x2, z2])
    y2 = normalize([y1, x2, z2])

    model = tf.keras.Model([x0, z0], y2)
    # The layer is built once it has been called above, so its weights can
    # now be replaced with the supplied embedding matrix.
    model.get_layer(name='embedding').set_weights([embedding_matrix])
    return model

In [8]:
# Instantiate the similarity model with the loaded embedding matrix as its weights.
model = CosineSimilarityModel(dict(embedding_matrix=embedding_matrix))

In [9]:
# Look up each word's vocabulary index and shape it as a (1, 1) batch.
word1 = np.array(word_to_idx['father']).reshape(1, 1)
word2 = np.array(word_to_idx['mother']).reshape(1, 1)

model.predict([word1, word2])

array([[0.89090383]], dtype=float32)

In [10]:
# Look up each word's vocabulary index and shape it as a (1, 1) batch.
word3 = np.array(word_to_idx['ball']).reshape(1, 1)
word4 = np.array(word_to_idx['crocodile']).reshape(1, 1)

model.predict([word3, word4])

array([[0.27439246]], dtype=float32)