In [1]:
import numpy as np
import torch.nn as nn
import json

In [2]:
def glove_dict(glove_file):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is expected to hold a token followed by its vector
    components, separated by whitespace.

    Parameters
    ----------
    glove_file : str or path-like
        Path to the GloVe vectors file (UTF-8 encoded text).

    Returns
    -------
    dict
        Maps each token (``str``) to a 1-D ``numpy`` float array. Keys keep
        the order in which tokens first appear in the file; if a token is
        repeated, the vector from its last occurrence is kept.

    Raises
    ------
    ValueError
        If a vector component cannot be converted to ``float``.
    """
    glove = {}

    # Read as bytes and decode per line so parsing does not depend on the
    # platform's default text encoding.
    with open(glove_file, 'rb') as f:
        for raw_line in f:
            fields = raw_line.decode('utf-8').split()
            if not fields:
                # Blank (e.g. trailing) line: nothing to parse.
                continue
            glove[fields[0]] = np.array(fields[1:]).astype(float)

    return glove

def glove_matrix(voc, glove, emb_dim=None):
    """Build an embedding weight matrix for a vocabulary from GloVe vectors.

    Row ``i`` holds the GloVe vector of ``voc[i]``. Words missing from
    ``glove`` get a random vector drawn from a normal distribution with
    standard deviation 0.6 (using NumPy's global random state).

    Parameters
    ----------
    voc : sequence
        Vocabulary entries; each is used directly as a key into ``glove``.
    glove : dict
        Mapping from word to 1-D vector.
    emb_dim : int, optional
        Vector dimensionality. When omitted it is taken from the first vector
        in ``glove``, falling back to 50 if ``glove`` is empty.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(voc), emb_dim)``.
    """
    if emb_dim is None:
        # Infer the width from the data instead of assuming 50-d vectors.
        emb_dim = len(next(iter(glove.values()))) if glove else 50

    weights_matrix = np.zeros((len(voc), emb_dim))

    for i, word in enumerate(voc):
        try:
            weights_matrix[i] = glove[word]
        except KeyError:
            # Out-of-vocabulary word: initialise with random noise.
            weights_matrix[i] = np.random.normal(scale=0.6, size=emb_dim)

    return weights_matrix

def create_emb_layer(weights_matrix, non_trainable=False):
    """Create an ``nn.Embedding`` layer initialised from a weight matrix.

    Parameters
    ----------
    weights_matrix : torch.Tensor or numpy.ndarray
        2-D matrix of shape ``(num_embeddings, embedding_dim)``. NumPy arrays
        (such as the output of ``glove_matrix``) are converted to a tensor.
    non_trainable : bool, optional
        When true, the layer's weights are frozen (``requires_grad = False``).

    Returns
    -------
    tuple
        ``(emb_layer, num_embeddings, embedding_dim)``.

    Raises
    ------
    ValueError
        If ``weights_matrix`` is not 2-D.
    """
    # Local import: the file-level import only binds ``nn``, not ``torch``.
    import torch

    # Accept NumPy input too; ndarray.size is an int, not a method, so the
    # tensor-only ``.size()`` call would fail on it.
    weights = torch.as_tensor(weights_matrix)
    num_embeddings, embedding_dim = weights.shape

    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    # load_state_dict copies the values into the layer's own parameter.
    emb_layer.load_state_dict({'weight': weights})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [3]:
# Load the training data; the context manager closes the file handle, and the
# explicit encoding avoids depending on the platform default.
with open("buzztrain.json", encoding="utf-8") as _train_fh:
    file = json.load(_train_fh)

In [4]:
# Load the training data; the context manager closes the file handle, and the
# explicit encoding avoids depending on the platform default.
with open("buzztrain.json", encoding="utf-8") as _buzz_fh:
    buzztrain = json.load(_buzz_fh)

# Collect the distinct guesses (as UTF-8 bytes) in order of first appearance.
# Membership is tested against a set so de-duplication stays linear instead of
# rescanning the growing list for every record.
vocab = []
_seen_guesses = set()

for q in buzztrain["buzzer_data"]:
    x = q["guess"].encode('utf8')
    if x not in _seen_guesses:
        _seen_guesses.add(x)
        vocab.append(x)

len(vocab)

24453

In [5]:
# Load the pretrained GloVe vectors into a word -> vector dict.
# NOTE(review): the file name suggests 50-dimensional vectors — confirm.
glove = glove_dict('glove.6B.50d.txt')

In [25]:
def guess_embed(guess, glove_dict):
    """Embed a guess as the mean of the GloVe vectors of its tokens.

    The guess is lower-cased and split on underscores; each resulting token is
    looked up in ``glove_dict`` and the vectors are averaged.

    Parameters
    ----------
    guess : bytes or str
        Guess with underscore-separated words. Non-``str`` input is decoded
        as UTF-8.
    glove_dict : dict
        Mapping from lower-case token to vector. Inside this function the
        name shadows the module-level loader function of the same name.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the token vectors.

    Raises
    ------
    KeyError
        If any token is missing from ``glove_dict``; there is no fallback
        vector, so callers must handle out-of-vocabulary guesses.
    """
    if not isinstance(guess, str):
        guess = guess.decode('utf8')
    gv = [glove_dict[x] for x in guess.lower().split('_')]
    return sum(gv)/len(gv)

In [7]:
# Spot-check a single vocabulary entry; entries are stored as UTF-8 bytes.
vocab[12843]

b'Andr\xc3\xa9_Brink'

In [29]:
# Sanity check: show one multi-word guess and its averaged GloVe embedding.
print(vocab[12860])
guess_embed(vocab[12860], glove)

b'Centre_for_Research_on_Energy_Security'


array([ 3.27364500e-01,  4.04440367e-01,  1.21126000e-01,  2.66280833e-01,
        1.08242000e-01, -1.45657667e-01, -4.87428333e-01, -6.22336667e-01,
        5.69504500e-01, -2.55036833e-01,  1.97910833e-01, -8.17066667e-02,
       -3.59115167e-01,  9.16410833e-02, -2.39325000e-01,  2.86368333e-01,
        3.03854000e-01,  1.37301667e-01, -3.76765000e-01, -9.69260000e-02,
        7.22850000e-01,  2.83131667e-01, -1.40120000e-01, -3.64131667e-01,
       -2.24475167e-01, -1.43282167e+00,  2.61833833e-01, -1.53319167e-01,
       -3.19189667e-01,  2.82802333e-01,  3.43825000e+00,  1.46570000e-01,
       -3.20226667e-01, -9.33691667e-01, -2.13257533e-01,  5.51350000e-02,
       -1.01866667e-02,  1.59596667e-01,  4.00389000e-01,  1.74630167e-01,
       -1.18683333e-03,  2.15686667e-02,  1.19681667e-01, -1.21122667e-01,
        1.10844333e-01,  1.51705833e-01, -1.62081167e-01,  4.92278333e-01,
        1.39504167e-01,  1.00340000e-01])

In [18]:
# Count the vocabulary entries that cannot be embedded, i.e. those for which
# guess_embed raises KeyError because a token is missing from GloVe.
errors = 0

for a in vocab:
    try:
        guess_embed(a, glove)
    except KeyError:
        errors += 1

In [19]:
# Number of vocabulary entries whose embedding lookup raised KeyError.
errors

3047

In [20]:
# Total number of distinct guesses, for comparison with the error count.
len(vocab)

24453

In [21]:
# Number of distinct tokens in the loaded GloVe table.
len(glove)

400001