## SETUP

In [18]:
import requests
import gzip
import zipfile
import os
from os import path
import numpy as np
import matplotlib.pyplot as plt
import copy
import re
import torch 
import io
import time 
import pickle
from collections import defaultdict
from datetime import datetime

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device used: ", device)

Device used:  cuda


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# if you want, you can use the below script to automatically find where the directory is in your GDrive and cd into it.
# %cd '/content/drive'
# directory_name = !find . -type d -name "ucl-nlp-finalproject"
# directory_name = directory_name[0]
# print(directory_name)

In [3]:
%cd 'drive/My Drive/Colab Notebooks'

/content/drive/My Drive/Colab Notebooks


In [21]:
EXPERIMENT_FOLDER_NAME = ""
assert len(EXPERIMENT_FOLDER_NAME) != 0

AssertionError: ignored

In [0]:
## Load embeddings from .npy file and load W2V_dict (word to id) from pickle file
W2V_weights = np.load('embeddings.npy')
with open('word2index.pickle', 'rb') as handle:
  W2V_dict = pickle.load(handle)
# with open('word_freq.pickle', 'rb') as handle2:
#   W2freq_dict = pickle.load(handle2)

In [0]:
# ## Sort by frequency to get top RESTRICT_NUM tokens -- standard seems to be top 300K.
# RESTRICT_NUM = 300000

# ordered_freq_tuple_list = [(token, freq) for token, freq in zip(W2freq_dict.keys(), W2freq_dict.values())] # returns list of tuples [(token, freq)]
# ordered_freq_tuple_list = sorted(ordered_freq_tuple_list, key = lambda x: x[1], reverse=True) # sorts the above by the frequency in descending order 
# restricted_freq_tuple_list = ordered_freq_tuple_list[:RESTRICT_NUM] # grab first RESTRICT_NUM tokens

# # renumber the tokens
# W2V_dict = dict([(x[0], i) for i, x in enumerate(restricted_freq_tuple_list)])

# # grab embeddings in this order 
# indices_to_grab = [original_W2V_dict[token] for token, _ in restricted_freq_tuple_list]
# # create new matrix now 
# W2V_weights = original_W2V_weights[indices_to_grab]

# if original_W2V_weights.shape[0] < RESTRICT_NUM:
#   assert W2V_weights.shape[0] == original_W2V_weights.shape[0]
# else: 
#   assert W2V_weights.shape[0] == RESTRICT_NUM

### ANALOGIES

In [0]:
# Credit to Facebook MUSE repo for base word analogy code -- didn't think to use matrix mult of normalized embeds to do cosine-distance nearest neighbor search, clever!
# I added gross CUDA code and dynamic sizing of matrices during batching as to not exceed Colab GPU memory allocation 

# IMPORTANT -- whether the model has embeddings only for lowercase. If so, lower should be True. If model has embeddings for uppercase (i.e. proper nouns), should be False.

def get_word_id(word, word2id, lower):
    """
    Get a word ID.
    If the model does not use lowercase and the evaluation file is lowercased,
    we might be able to find an associated word.
    """
    assert type(lower) is bool
    word_id = word2id.get(word)
    if word_id is None and not lower:
        word_id = word2id.get(word.capitalize())
    if word_id is None and not lower:
        word_id = word2id.get(word.title())
    return word_id

def get_wordanalogy_scores(dirpath, word2id, embeddings, lower):
    """
    Return (english) word analogy score
    """
    if not os.path.isdir(dirpath):
        return None

    # normalize word embeddings
    embeddings = embeddings / np.sqrt((embeddings ** 2).sum(1))[:, None]

    # scores by category
    scores = defaultdict(dict)

    word_ids = {}
    queries = {}

    num_examples_thrown = 0
    total_examples = 0
    with io.open(os.path.join(dirpath, 'questions-words.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            # new line
            line = line.rstrip()
            if lower:
                line = line.lower()

            # new category
            if ":" in line:
                assert line[1] == ' '
                category = line[2:]
                assert category not in scores
                scores[category] = {'n_found': 0, 'n_not_found': 0, 'n_correct': 0}
                word_ids[category] = []
                queries[category] = []
                continue

            # get word IDs
            assert len(line.split()) == 4, line
            word1, word2, word3, word4 = line.split()

            word_id1 = get_word_id(word1, word2id, lower)
            word_id2 = get_word_id(word2, word2id, lower)
            word_id3 = get_word_id(word3, word2id, lower)
            word_id4 = get_word_id(word4, word2id, lower)

            # if at least one word is not found
            if any(x is None for x in [word_id1, word_id2, word_id3, word_id4]):
                scores[category]['n_not_found'] += 1
                num_examples_thrown +=1 
                continue
            else:
                scores[category]['n_found'] += 1
                word_ids[category].append([word_id1, word_id2, word_id3, word_id4])
                # generate query vector and get nearest neighbors
                query = embeddings[word_id1] - embeddings[word_id2] + embeddings[word_id4]
                query = query / np.linalg.norm(query)

                queries[category].append(query)
            total_examples += 1
    
    print("Done scanning, threw {} examples out of {} total = {} % discarded".format(num_examples_thrown, total_examples, float(num_examples_thrown)/total_examples * 100))
    
    # Compute score for each category
    total_cats = len(queries)
    curr_cat = 0
    ROW_LIMIT = 500

    with torch.no_grad(): # make sure to not store computational graph info
      for cat in queries:

          start_time = time.time()

          qs_np = np.vstack(queries[cat])
          qs_shape = qs_np.shape
          
          for i in range(0, qs_shape[0], ROW_LIMIT): #allocate matrices of size ROW LIMIT rows 
            if i >= ROW_LIMIT:
              total_cats += 1 
            qs = torch.from_numpy(qs_np[i:i + ROW_LIMIT, :]).cuda()
            keys = torch.from_numpy(embeddings.T).cuda()
            values = qs.mm(keys)

            # free up memory 
            del qs
            del keys 
            torch.cuda.empty_cache()

            word_ids_tensor = torch.tensor(word_ids[cat]).cuda()
            curr_word_ids_tensor = word_ids_tensor[i:i+ROW_LIMIT, :]

            # be sure we do not select input words
            for j, ws in enumerate(curr_word_ids_tensor):
                for wid in [ws[0], ws[1], ws[3]]:
                    values[j, wid] = -1e9
            maxes, indices = values.max(axis = 1)
            correct_indices = curr_word_ids_tensor[:, 2]
            num_correct = torch.sum(torch.eq(indices, correct_indices)).item()
            key = cat + "_{}".format(str(round(i/(ROW_LIMIT))))
            scores[key]['n_correct'] = num_correct

            curr_cat +=1 
            print('finished batch {} out of {}, took {} seconds'.format(curr_cat, total_cats, time.time() - start_time))
            
            # clean up memory
            del values 
            del word_ids_tensor
            del maxes
            del indices
            del correct_indices 
            torch.cuda.empty_cache()
            # print("Current CUDA snapshot after del and empty cache at end of loop", torch.cuda.memory_allocated())

    # compute and log accuracies

    print('computing total accuracy')
    total_correct = 0
    total_found = 0

    for k in sorted(scores.keys()):
        v = scores[k]
        total_correct += v['n_correct']
        total_found += v.get('n_found', 0)

    print("total correct: {}, total found: {}".format(total_correct, total_found))
    total_accuracy = float(total_correct)/total_found
    print("total acc: {}".format(total_accuracy))
    return scores, total_correct, total_found, total_accuracy

In [20]:
# do it on the restricted weights, i.e. first 300000 word vectors (roughly sorted by frequency as per Mikolov's post)
start = time.time()
scores, total_correct, total_found, total_accuracy = get_wordanalogy_scores("./", W2V_dict, W2V_weights, True) # False means we have embeddings for uppercased words, True menas embeddings for lowercase
end = time.time()
print("Completed, takes: {} seconds".format(end - start))

Done scanning, threw 438 examples out of 19106 total = 2.292473568512509 % discarded
finished batch 1 out of 14, took 0.4430065155029297 seconds
finished batch 2 out of 15, took 0.7773175239562988 seconds
finished batch 3 out of 15, took 0.44916510581970215 seconds
finished batch 4 out of 16, took 0.8971123695373535 seconds
finished batch 5 out of 17, took 1.3446683883666992 seconds
finished batch 6 out of 18, took 1.7935545444488525 seconds
finished batch 7 out of 19, took 2.243464708328247 seconds
finished batch 8 out of 20, took 2.6937456130981445 seconds
finished batch 9 out of 21, took 3.1356089115142822 seconds
finished batch 10 out of 22, took 3.5828776359558105 seconds
finished batch 11 out of 23, took 3.9767773151397705 seconds
finished batch 12 out of 23, took 0.4341607093811035 seconds
finished batch 13 out of 24, took 0.8526771068572998 seconds
finished batch 14 out of 24, took 0.4484748840332031 seconds
finished batch 15 out of 25, took 0.9016714096069336 seconds
finished 

In [17]:
with open('../data/scores/{}/{}.txt'.format(EXPERIMENT_FOLDER_NAME,'analogies_' + str(datetime.now())), 'w') as out:
  out.write("Total accuracy: {}. Total correct {} out of total found {}".format(total_accuracy, total_correct, total_found))

0.0
