# 词向量

在这一份notebook中，我们会（尽可能）尝试复现论文[Distributed Representations of Words and Phrases and their Compositionality](http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)中训练词向量的方法. 我们会实现Skip-gram模型，并且使用论文中noice contrastive sampling的目标函数。

这篇论文有很多模型实现的细节，这些细节对于词向量的好坏至关重要。我们虽然无法完全复现论文中的实验结果，主要是由于计算资源等各种细节原因，但是我们还是可以大致展示如何训练词向量。

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud
from torch.nn.parameter import Parameter

from collections import Counter
import numpy as np
import random
import math

import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

USE_CUDA = torch.cuda.is_available()

random.seed(53113)
np.random.seed(53113)
torch.manual_seed(53113)
if USE_CUDA:
    torch.cuda.manual_seed(53113)
    
# Set the hyperparameters
# Please try different hyper parameters and report your discoveries in the writeup. 
    
K = 100 # number of negative samples
C = 3 # nearby words threshold
T = 1. # subsampling threshold
NUM_EPOCHS = 10 # The number of epochs of training
MAX_VOCAB_SIZE = 30000 # the vocabulary size
BATCH_SIZE = 128 # the batch size
LEARNING_RATE = 0.2 # the initial learning rate
SUBSAMPLING = True
EMBEDDING_SIZE = 100
       
    
LOG_FILE = "word-embedding.log"

# the function we use to tokenize the document into words
def word_tokenize(text):
    return text.split()

Read the text from the datafile, and create the vocabulary (including the label for unknown word (UNK)), the word to index mapping, index to word mapping, word counts, (normalized) word frequency, and the total vocabulary size

In [15]:
with open("text8.train.txt", "r") as fin:
    text = fin.read()
    
text = [w for w in word_tokenize(text.lower())]
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE-1))
vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))
idx_to_word = [word for word in vocab.keys()] 
word_to_idx = {word:i for i, word in enumerate(idx_to_word)}

word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3./4.)
word_freqs = word_freqs / np.sum(word_freqs) # used to get negative samples
VOCAB_SIZE = len(idx_to_word)
VOCAB_SIZE

30000

### 实现Dataloader

一个dataloader需要以下内容：

- store all the encoded text, and preprocess the encoded text by subsampling. We provide a 
- 把所有
- maintain the vocabulary, word counts, the normalized word frequency
- sample a center word at each iteration
- return the context words given a center word
- get negative samples by the normalized word frequencies
- return the word counts for subsampling

Here is a nice tutorial about how to use [PyTorch dataloader](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html). To use the dataloader, you only need to define two functions:

- ```__len__``` function returns the total size of your dataset. 
- ```__get__``` item returns the item given an index.

In [16]:
class WordEmbeddingDataset(tud.Dataset):
    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        ''' text: a list of words, all text from the training dataset
            word_to_idx: the dictionary from word to idx
            idx_to_word: idx to word mapping
            word_freq: the frequency of each word
            word_counts: the word counts
        '''
        super(WordEmbeddingDataset, self).__init__()
        self.text_encoded = [word_to_idx.get(t, VOCAB_SIZE-1) for t in text]
        
        if SUBSAMPLING:
            P = {}
            for idx, x in enumerate(word_freqs):
                y = (math.sqrt(x/0.001)+1)*0.001/x
                P[idx] = y
            subsampled_data = []
            for word_idx in self.text_encoded:
                if random.random()<P[word_idx]:
                    subsampled_data.append(word_idx)
            self.text_encoded = subsampled_data
        
        self.text_encoded = torch.Tensor(self.text_encoded).long()
        
        
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)
        
    def __len__(self):
        ''' return the length of the whole dataset (the length of text)
        '''
        return len(self.text_encoded)
        
    def __getitem__(self, idx):
        ''' This function returns the following data for training
            - center word at location idx
            - the surrounding words near idx of window size C (positive words)
            - the negative sampled words, sample K words for each surrounding word (negative words)
        '''
        
        # TODO
        center_word = self.text_encoded[idx]
        
        pos_indices = list(range(idx-C, idx)) + list(range(idx+1, idx+C+1))
        pos_indices = [i%len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices] 

        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)
        
        return center_word, pos_words, neg_words

# Create the dataset and dataloader
dataset = WordEmbeddingDataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)      

### 定义PyTorch模型

In [17]:
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        ''' 初始化输出和输出embedding
        '''
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        
        initrange = 0.5 / self.embed_size
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.out_embed.weight.data.uniform_(-initrange, initrange)
        
        
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.in_embed.weight.data.uniform_(-initrange, initrange)
        
        
    def forward(self, input_labels, pos_labels, neg_labels):
        '''
        input_labels: 中心词, [batch_size]
        pos_labels: 中心词周围 context window 出现过的单词 [batch_size * (window_size * 2)]
        neg_labelss: 中心词周围没有出现过的单词，从 negative sampling 得到 [batch_size, (window_size * 2 * K)]
        
        return: loss, [batch_size]
        '''
        
        batch_size = input_labels.size(0)
        
        input_embedding = self.in_embed(input_labels) # B * embed_size
        pos_embedding = self.out_embed(pos_labels) # B * (2*C) * embed_size
        neg_embedding = self.out_embed(neg_labels) # B * (2*C * K) * embed_size
      
        log_pos = torch.bmm(pos_embedding, input_embedding.unsqueeze(2)).squeeze() # B * (2*C)
        log_neg = torch.bmm(neg_embedding, -input_embedding.unsqueeze(2)).squeeze() # B * (2*C*K)

        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1) # batch_size
       
        loss = log_pos + log_neg
        
        return -loss
    
    def input_embeddings(self):
        return self.in_embed.weight.data.cpu().numpy()
        
model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()

下面是评估模型的代码，以及训练模型的代码

In [19]:
def evaluate(filename, embedding_weights): 
    if filename.endswith(".csv"):
        data = pd.read_csv(filename, sep=",")
    else:
        data = pd.read_csv(filename, sep="\t")
    human_similarity = []
    model_similarity = []
    for i in data.iloc[:, 0:2].index:
        word1, word2 = data.iloc[i, 0], data.iloc[i, 1]
        if word1 not in word_to_idx or word2 not in word_to_idx:
            continue
        else:
            word1_idx, word2_idx = word_to_idx[word1], word_to_idx[word2]
            word1_embed, word2_embed = embedding_weights[[word1_idx]], embedding_weights[[word2_idx]]
            model_similarity.append(float(sklearn.metrics.pairwise.cosine_similarity(word1_embed, word2_embed)))
            human_similarity.append(float(data.iloc[i, 2]))

    return scipy.stats.spearmanr(human_similarity, model_similarity)# , model_similarity

def find_nearest(word):
    index = word_to_idx[word]
    embedding = embedding_weights[index]
    cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
    return [idx_to_word[i] for i in cos_dis.argsort()[:10]]


optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
for e in range(NUM_EPOCHS):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        ''' - Convert the input tensors to the correct type (e.g., convert to CUDA tensors if you are training on GPU)
            - zero_grad the optimizer
            - compute the loss
            - run backward from the loss
            - run one step of the optimizer
            
            optionally:
            Do the following things to help you debug your code and monitor the training progress. 
            - print the batch loss after every 100 iterations
            - evaluate the model on three word semantic similarity dataset and print the statistics after every 10000 iterations
        '''
        
        # TODO
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()
        if USE_CUDA:
            input_labels = input_labels.cuda()
            pos_labels = pos_labels.cuda()
            neg_labels = neg_labels.cuda()
            
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            with open(LOG_FILE, "a") as fout:
                fout.write("epoch: {}, iter: {}, loss: {}\n".format(e, i, loss.item()))
                print("epoch: {}, iter: {}, loss: {}".format(e, i, loss.item()))
            
        
        if i % 2000 == 0:
            embedding_weights = model.input_embeddings()
            sim_simlex = evaluate("simlex-999.txt", embedding_weights)
            sim_men = evaluate("men.txt", embedding_weights)
            sim_353 = evaluate("wordsim353.csv", embedding_weights)
            with open(LOG_FILE, "a") as fout:
                print("epoch: {}, iteration: {}, simlex-999: {}, men: {}, sim353: {}, nearest to monster: {}\n".format(
                    e, i, sim_simlex, sim_men, sim_353, find_nearest("monster")))
                fout.write("epoch: {}, iteration: {}, simlex-999: {}, men: {}, sim353: {}, nearest to monster: {}\n".format(
                    e, i, sim_simlex, sim_men, sim_353, find_nearest("monster")))
                
    embedding_weights = model.input_embeddings()
    np.save("embedding-{}".format(EMBEDDING_SIZE), embedding_weights)
    torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))

epoch: 0, iter: 0, loss: 75.98824310302734
epoch: 0, iteration: 0, simlex-999: SpearmanrResult(correlation=-0.027985322356834944, pvalue=0.38716202019675516), men: SpearmanrResult(correlation=0.05255807622508304, pvalue=0.0074537177246504095), sim353: SpearmanrResult(correlation=-0.013661188678445277, pvalue=0.8082642425373302), nearest to monster: ['monster', 'shall', 'respect', 'represent', 'indonesia', 'differences', 'zone', 'practical', 'jerry', 'historians']

epoch: 0, iter: 100, loss: 83.71326446533203
epoch: 0, iter: 200, loss: 60.44464874267578
epoch: 0, iter: 300, loss: 67.8129653930664
epoch: 0, iter: 400, loss: 72.78404235839844
epoch: 0, iter: 500, loss: 61.33769989013672
epoch: 0, iter: 600, loss: 66.45417022705078
epoch: 0, iter: 700, loss: 64.60382080078125
epoch: 0, iter: 800, loss: 50.91881561279297
epoch: 0, iter: 900, loss: 62.70906066894531
epoch: 0, iter: 1000, loss: 54.50410461425781
epoch: 0, iter: 1100, loss: 51.664127349853516
epoch: 0, iter: 1200, loss: 51.443

epoch: 0, iter: 12100, loss: 34.37571716308594
epoch: 0, iter: 12200, loss: 33.34493637084961
epoch: 0, iter: 12300, loss: 34.015419006347656
epoch: 0, iter: 12400, loss: 32.90447235107422
epoch: 0, iter: 12500, loss: 34.26510238647461
epoch: 0, iter: 12600, loss: 34.073951721191406
epoch: 0, iter: 12700, loss: 33.46040725708008
epoch: 0, iter: 12800, loss: 35.63954162597656
epoch: 0, iter: 12900, loss: 34.36619567871094
epoch: 0, iter: 13000, loss: 32.669822692871094
epoch: 0, iter: 13100, loss: 33.55662536621094
epoch: 0, iter: 13200, loss: 34.199832916259766
epoch: 0, iter: 13300, loss: 34.72283935546875
epoch: 0, iter: 13400, loss: 36.48380661010742
epoch: 0, iter: 13500, loss: 34.06208419799805
epoch: 0, iter: 13600, loss: 33.37775421142578
epoch: 0, iter: 13700, loss: 34.44108963012695
epoch: 0, iter: 13800, loss: 33.8083381652832
epoch: 0, iter: 13900, loss: 33.55755615234375
epoch: 0, iter: 14000, loss: 33.51102828979492
epoch: 0, iteration: 14000, simlex-999: SpearmanrResult(c

epoch: 0, iter: 24200, loss: 32.68549346923828
epoch: 0, iter: 24300, loss: 33.12438201904297
epoch: 0, iter: 24400, loss: 32.74500274658203
epoch: 0, iter: 24500, loss: 32.416725158691406
epoch: 0, iter: 24600, loss: 32.75374984741211
epoch: 0, iter: 24700, loss: 33.14791488647461
epoch: 0, iter: 24800, loss: 32.1576042175293
epoch: 0, iter: 24900, loss: 32.75824737548828
epoch: 0, iter: 25000, loss: 32.772464752197266
epoch: 0, iter: 25100, loss: 32.51237106323242
epoch: 0, iter: 25200, loss: 33.332889556884766
epoch: 0, iter: 25300, loss: 33.03108596801758
epoch: 0, iter: 25400, loss: 32.843387603759766
epoch: 0, iter: 25500, loss: 32.54827117919922
epoch: 0, iter: 25600, loss: 32.58208465576172
epoch: 0, iter: 25700, loss: 32.54391860961914
epoch: 0, iter: 25800, loss: 32.62670135498047
epoch: 0, iter: 25900, loss: 32.51728057861328
epoch: 0, iter: 26000, loss: 32.307769775390625
epoch: 0, iteration: 26000, simlex-999: SpearmanrResult(correlation=0.07242217548925875, pvalue=0.02506

epoch: 0, iter: 36300, loss: 31.97287368774414
epoch: 0, iter: 36400, loss: 32.37919616699219
epoch: 0, iter: 36500, loss: 32.58899688720703
epoch: 0, iter: 36600, loss: 32.10148620605469
epoch: 0, iter: 36700, loss: 32.28821563720703
epoch: 0, iter: 36800, loss: 32.158447265625
epoch: 0, iter: 36900, loss: 32.34330749511719
epoch: 0, iter: 37000, loss: 32.43210220336914
epoch: 0, iter: 37100, loss: 32.32413101196289
epoch: 0, iter: 37200, loss: 32.38295364379883
epoch: 0, iter: 37300, loss: 32.26492691040039
epoch: 0, iter: 37400, loss: 32.17142868041992
epoch: 0, iter: 37500, loss: 32.52403259277344
epoch: 0, iter: 37600, loss: 31.6724853515625
epoch: 0, iter: 37700, loss: 32.03404235839844
epoch: 0, iter: 37800, loss: 32.210304260253906
epoch: 0, iter: 37900, loss: 31.980937957763672
epoch: 0, iter: 38000, loss: 31.95423126220703
epoch: 0, iteration: 38000, simlex-999: SpearmanrResult(correlation=0.08561918137427808, pvalue=0.008047488056354529), men: SpearmanrResult(correlation=0.1

epoch: 0, iter: 48400, loss: 32.20608139038086
epoch: 0, iter: 48500, loss: 31.85419464111328
epoch: 0, iter: 48600, loss: 31.568431854248047
epoch: 0, iter: 48700, loss: 31.94073486328125
epoch: 0, iter: 48800, loss: 32.03761291503906
epoch: 0, iter: 48900, loss: 32.14366149902344
epoch: 0, iter: 49000, loss: 31.83098030090332
epoch: 0, iter: 49100, loss: 32.2159423828125
epoch: 0, iter: 49200, loss: 31.69326400756836
epoch: 0, iter: 49300, loss: 31.929866790771484
epoch: 0, iter: 49400, loss: 31.86402702331543
epoch: 0, iter: 49500, loss: 31.363256454467773
epoch: 0, iter: 49600, loss: 32.11054992675781
epoch: 0, iter: 49700, loss: 32.43081283569336
epoch: 0, iter: 49800, loss: 31.8527774810791
epoch: 0, iter: 49900, loss: 32.14591979980469
epoch: 0, iter: 50000, loss: 31.907093048095703
epoch: 0, iteration: 50000, simlex-999: SpearmanrResult(correlation=0.09808617902785667, pvalue=0.0023839219861309436), men: SpearmanrResult(correlation=0.12500324766631712, pvalue=1.7152529581618855

epoch: 0, iter: 60500, loss: 31.94855499267578
epoch: 0, iter: 60600, loss: 32.16680908203125
epoch: 0, iter: 60700, loss: 31.794523239135742
epoch: 0, iter: 60800, loss: 31.75855827331543
epoch: 0, iter: 60900, loss: 32.057254791259766
epoch: 0, iter: 61000, loss: 31.51314926147461
epoch: 0, iter: 61100, loss: 32.1302375793457
epoch: 0, iter: 61200, loss: 32.26987075805664
epoch: 0, iter: 61300, loss: 31.70214080810547
epoch: 0, iter: 61400, loss: 31.92788314819336
epoch: 0, iter: 61500, loss: 31.450580596923828
epoch: 0, iter: 61600, loss: 31.70734977722168
epoch: 0, iter: 61700, loss: 32.19746780395508
epoch: 0, iter: 61800, loss: 31.911523818969727
epoch: 0, iter: 61900, loss: 31.86284828186035
epoch: 0, iter: 62000, loss: 32.12932205200195
epoch: 0, iteration: 62000, simlex-999: SpearmanrResult(correlation=0.10650249574333052, pvalue=0.0009675988797736672), men: SpearmanrResult(correlation=0.1342313796804003, pvalue=6.866874214223525e-12), sim353: SpearmanrResult(correlation=0.178

epoch: 0, iter: 72600, loss: 32.06332015991211
epoch: 0, iter: 72700, loss: 31.54588508605957
epoch: 0, iter: 72800, loss: 32.08677673339844
epoch: 0, iter: 72900, loss: 31.931236267089844
epoch: 0, iter: 73000, loss: 31.79207420349121
epoch: 0, iter: 73100, loss: 31.711414337158203
epoch: 0, iter: 73200, loss: 31.895408630371094
epoch: 0, iter: 73300, loss: 32.122703552246094
epoch: 0, iter: 73400, loss: 31.429895401000977
epoch: 0, iter: 73500, loss: 31.77893829345703
epoch: 0, iter: 73600, loss: 31.685319900512695
epoch: 0, iter: 73700, loss: 31.391231536865234
epoch: 0, iter: 73800, loss: 31.570140838623047
epoch: 0, iter: 73900, loss: 31.903995513916016
epoch: 0, iter: 74000, loss: 31.75525665283203
epoch: 0, iteration: 74000, simlex-999: SpearmanrResult(correlation=0.11789655043261746, pvalue=0.0002570150735033117), men: SpearmanrResult(correlation=0.14072276868996608, pvalue=6.226570637879398e-13), sim353: SpearmanrResult(correlation=0.19418345265440315, pvalue=0.000497023432840

epoch: 0, iter: 84600, loss: 31.45741844177246
epoch: 0, iter: 84700, loss: 31.38296127319336
epoch: 0, iter: 84800, loss: 32.034305572509766
epoch: 0, iter: 84900, loss: 31.581586837768555
epoch: 0, iter: 85000, loss: 31.44800567626953
epoch: 0, iter: 85100, loss: 31.26881217956543
epoch: 0, iter: 85200, loss: 31.631162643432617
epoch: 0, iter: 85300, loss: 31.152894973754883
epoch: 0, iter: 85400, loss: 31.4873046875
epoch: 0, iter: 85500, loss: 31.22477912902832
epoch: 0, iter: 85600, loss: 31.850135803222656
epoch: 0, iter: 85700, loss: 31.59015655517578
epoch: 0, iter: 85800, loss: 31.02224349975586
epoch: 0, iter: 85900, loss: 31.68284797668457
epoch: 0, iter: 86000, loss: 31.70503044128418
epoch: 0, iteration: 86000, simlex-999: SpearmanrResult(correlation=0.12651179472506033, pvalue=8.69613536979451e-05), men: SpearmanrResult(correlation=0.14799131654954098, pvalue=3.7001124612891766e-14), sim353: SpearmanrResult(correlation=0.2054781523685685, pvalue=0.00022494898768681283), n

epoch: 0, iter: 96700, loss: 31.797943115234375
epoch: 0, iter: 96800, loss: 31.560855865478516
epoch: 0, iter: 96900, loss: 31.830810546875
epoch: 0, iter: 97000, loss: 31.86542510986328
epoch: 0, iter: 97100, loss: 31.803014755249023
epoch: 0, iter: 97200, loss: 31.33492088317871
epoch: 0, iter: 97300, loss: 31.66841697692871
epoch: 0, iter: 97400, loss: 31.712961196899414
epoch: 0, iter: 97500, loss: 32.221771240234375
epoch: 0, iter: 97600, loss: 31.681991577148438
epoch: 0, iter: 97700, loss: 31.145668029785156
epoch: 0, iter: 97800, loss: 31.821348190307617
epoch: 0, iter: 97900, loss: 31.678552627563477
epoch: 0, iter: 98000, loss: 31.360010147094727
epoch: 0, iteration: 98000, simlex-999: SpearmanrResult(correlation=0.13798246326425476, pvalue=1.8400025247265236e-05), men: SpearmanrResult(correlation=0.1517099506106283, pvalue=8.257168558963166e-15), sim353: SpearmanrResult(correlation=0.21818886444179086, pvalue=8.748601768157031e-05), nearest to monster: ['monster', 'demon', 

epoch: 1, iter: 9500, loss: 31.519529342651367
epoch: 1, iter: 9600, loss: 31.53604507446289
epoch: 1, iter: 9700, loss: 31.51484489440918
epoch: 1, iter: 9800, loss: 31.49486541748047
epoch: 1, iter: 9900, loss: 31.05791664123535
epoch: 1, iter: 10000, loss: 31.29766082763672
epoch: 1, iteration: 10000, simlex-999: SpearmanrResult(correlation=0.14224580895678246, pvalue=1.0000548589112858e-05), men: SpearmanrResult(correlation=0.15613446702225925, pvalue=1.3194538510503778e-15), sim353: SpearmanrResult(correlation=0.2235579190776937, pvalue=5.772368309630839e-05), nearest to monster: ['monster', 'demon', 'giant', 'definitive', 'warrior', 'joy', 'quote', 'ghost', 'sketch', 'bird']

epoch: 1, iter: 10100, loss: 31.538530349731445
epoch: 1, iter: 10200, loss: 31.4808406829834
epoch: 1, iter: 10300, loss: 31.295419692993164
epoch: 1, iter: 10400, loss: 31.025161743164062
epoch: 1, iter: 10500, loss: 31.204967498779297
epoch: 1, iter: 10600, loss: 31.481632232666016
epoch: 1, iter: 10700, 

epoch: 1, iter: 21600, loss: 31.346782684326172
epoch: 1, iter: 21700, loss: 31.316938400268555
epoch: 1, iter: 21800, loss: 31.716686248779297
epoch: 1, iter: 21900, loss: 31.169527053833008
epoch: 1, iter: 22000, loss: 31.45047378540039
epoch: 1, iteration: 22000, simlex-999: SpearmanrResult(correlation=0.14902618042853188, pvalue=3.655961940631879e-06), men: SpearmanrResult(correlation=0.15972720590580072, pvalue=2.860845488161654e-16), sim353: SpearmanrResult(correlation=0.23070832435161442, pvalue=3.265808381197622e-05), nearest to monster: ['monster', 'demon', 'giant', 'arrow', 'quote', 'triangle', 'warrior', 'sketch', 'joy', 'robot']

epoch: 1, iter: 22100, loss: 31.63100814819336
epoch: 1, iter: 22200, loss: 31.536144256591797
epoch: 1, iter: 22300, loss: 31.089426040649414
epoch: 1, iter: 22400, loss: 31.553382873535156
epoch: 1, iter: 22500, loss: 31.551122665405273
epoch: 1, iter: 22600, loss: 31.506092071533203
epoch: 1, iter: 22700, loss: 31.721607208251953
epoch: 1, iter:

Process Process-37:
Process Process-39:
Process Process-40:
Process Process-38:
Traceback (most recent call last):
  File "/project2/mpcs53113/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()


KeyboardInterrupt: 

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/project2/mpcs53113/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/project2/mpcs53113/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/project2/mpcs53113/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/project2/mpcs53113/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/project2/mpcs53113/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 57, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/project2/mpcs53113/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/project2/mpcs53113/anaconda3/lib/python3.6/multiproces

## Evaluation with MEN and Simplex-999

In [1]:
embedding_weights = model.input_embeddings()
print("simlex-999", evaluate("simlex-999.txt", embedding_weights))
print("men", evaluate("men.txt", embedding_weights))
print("wordsim353", evaluate("wordsim353.csv", embedding_weights))

NameError: name 'model' is not defined

## Find nearest neighbor

In [None]:
for word in ["good", "fresh", "monster", "green", "like", "america", "chicago", "work", "computer", "language"]:
    print(word, find_nearest(word))

## Analogical Reasoning

In [None]:
man_idx = word_to_idx["man"] 
king_idx = word_to_idx["king"] 
woman_idx = word_to_idx["woman"]
embedding = embedding_weights[woman_idx] - embedding_weights[man_idx] + embedding_weights[king_idx]
cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
for i in cos_dis.argsort()[:20]:
    print(idx_to_word[i])