## **We will implement Word2Vec. (Skip-gram)**

# **1. Build corpus from nltk brown.**

In [21]:
import re
import nltk
import itertools
nltk.download('brown')
from nltk.corpus import brown

corpus = []

for cat in ['news']:
    for text_id in brown.fileids(cat):
        raw_text = list(itertools.chain.from_iterable(brown.sents(text_id)))
        text = ' '.join(raw_text)
        text = text.lower()
        text.replace('\n', ' ')
        text = re.sub('[^a-z ]+', '', text)
        corpus.append([w for w in text.split() if w != ''])
temp=[]
for i in corpus:
  temp = temp+i
print(len(corpus),len(temp))

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\master\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


44 87004


# **2. Implement subsampling & Make vocabulary.** 

In [22]:
from collections import Counter
import random, math

def subsample_frequent_words(corpus):
    filtered_corpus = []
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    sum_word_counts = sum(list(word_counts.values()))
    word_counts = {word: word_counts[word]/float(sum_word_counts) for word in word_counts}
    for text in corpus:
        filtered_corpus.append([])
        for word in text:
            if random.random() < (1-math.sqrt(1e-5/word_counts[word])):
            #if random.random() < (1+math.sqrt(word_counts[word] * 1e3)) * 1e-3 / float(word_counts[word]):
                filtered_corpus[-1].append(word)
    return filtered_corpus

corpus = subsample_frequent_words(corpus)
temp=[]
for i in corpus:
      temp = temp+i
print(len(corpus), len(temp))

vocabulary = set(itertools.chain.from_iterable(corpus))
word_to_index = {w: idx for (idx, w) in enumerate(vocabulary)}
index_to_word = {idx: w for (idx, w) in enumerate(vocabulary)}

44 66648


# **3. Building bag of words**

In [3]:
import numpy as np

context_tuple_list = []
w = 4

for text in corpus:
    for i, word in enumerate(text):
        first_context_word_index = max(0,i-w)
        last_context_word_index = min(i+w, len(text))
        for j in range(first_context_word_index, last_context_word_index+1):
        #for j in range(first_context_word_index, last_context_word_index):
            if j == len(text):
                  continue;
            if i!=j:
                  context_tuple_list.append((word, text[j]))
            
print("There are {} pairs of target and context words".format(len(context_tuple_list)))

There are 535096 pairs of target and context words


# **4. Implement network.**

**4-1. CBOW**

In [4]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F


class CBOW(nn.Module):

    def __init__(self, embedding_size, vocab_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)
        
    def forward(self, context_word):
        emb = self.embeddings(context_word)
        hidden = self.linear(emb)
        out = F.log_softmax(hidden)
        return out

**4-2. Skip-gram**

In [5]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F


class Skip_gram(nn.Module):

    def __init__(self, embedding_size, vocab_size):
        super(Skip_gram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)
        
    def forward(self, target_word):
        emb = self.embeddings(target_word)
        hidden = self.linear(emb)
        out = F.log_softmax(hidden)
        return out

# **5. Training network.**

**5-1. Implement earlystopping**

In [6]:
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.1):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.
        
    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]
    
    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100*gain,2)))
        if gain < self.min_percent_gain:
            return True
        else:
            return False

**5-2. Implemnet get batches function**

In [7]:
import random

def get_batches(context_tuple_list, batch_size=100):
    random.shuffle(context_tuple_list)
    batches = []
    batch_target, batch_context = [], []
    for i in range(len(context_tuple_list)):
        batch_target.append(word_to_index[context_tuple_list[i][0]])
        batch_context.append(word_to_index[context_tuple_list[i][1]])
        if (i+1) % batch_size == 0 or i == len(context_tuple_list)-1:
            tensor_target = autograd.Variable(torch.from_numpy(np.array(batch_target)).long())
            tensor_context = autograd.Variable(torch.from_numpy(np.array(batch_context)).long())
            batches.append((tensor_target, tensor_context))
            batch_target, batch_context = [], []
    return batches

**5-3. Training model**

In [8]:
vocabulary_size = len(vocabulary)

net = Skip_gram(embedding_size=2, vocab_size=vocabulary_size)
net= net.cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())
early_stopping = EarlyStopping()

context_tensor_list = []

context_tensor_list = []

for target, context in context_tuple_list:
    target_tensor = autograd.Variable(torch.LongTensor([word_to_index[target]]))
    context_tensor = autograd.Variable(torch.LongTensor([word_to_index[context]]))
    context_tensor_list.append((target_tensor, context_tensor))

while True:
    losses = []
    context_tuple_batches = get_batches(context_tuple_list, batch_size=2000)
    for target_tensor, context_tensor in context_tuple_batches:
        target_tensor=target_tensor.cuda()
        context_tensor=context_tensor.cuda()
        net.zero_grad()
        log_probs = net(target_tensor)
        loss = loss_function(log_probs, context_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print("Loss: ", np.mean(losses))
    early_stopping.update_loss(np.mean(losses))
    if early_stopping.stop_training():
        break



Loss:  8.667883766231252
Loss:  8.274657875744264
Loss gain: 4.54%
Loss:  7.810480075096017
Loss gain: 9.89%
Loss:  7.298139027695157
Loss gain: 15.8%
Loss:  6.941162452768924
Loss gain: 19.92%
Loss:  6.727393842455166
Loss gain: 18.7%
Loss:  6.590473603846422
Loss gain: 15.62%
Loss:  6.498827443194034
Loss gain: 10.95%
Loss:  6.435112036875824
Loss gain: 7.29%
Loss:  6.390041886870541
Loss gain: 5.01%
Loss:  6.357535949393885
Loss gain: 3.53%
Loss:  6.333875874974835
Loss gain: 2.54%
Loss:  6.3160373915487265
Loss gain: 1.85%
Loss:  6.302103690247037
Loss gain: 1.38%
Loss:  6.291039975721445
Loss gain: 1.05%
Loss:  6.281961015800931
Loss gain: 0.82%
Loss:  6.274257272037108
Loss gain: 0.66%
Loss:  6.268058426344573
Loss gain: 0.54%
Loss:  6.262534856796265
Loss gain: 0.45%
Loss:  6.25797357843883
Loss gain: 0.38%
Loss:  6.253947149461775
Loss gain: 0.32%
Loss:  6.25068257637878
Loss gain: 0.28%
Loss:  6.2476917985659925
Loss gain: 0.24%
Loss:  6.2448978228355525
Loss gain: 0.21%
Loss:

# **6. Check original skip-gram result.**

In [9]:
import numpy as np

def get_closest_word(word, topn=5):
    word_distance = []
    emb = net.embeddings
    pdist = nn.PairwiseDistance()
    i = word_to_index[word]
    lookup_tensor_i = torch.tensor([i], dtype=torch.long)
    v_i = emb(lookup_tensor_i)
    for j in range(len(vocabulary)):
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long)
            v_j = emb(lookup_tensor_j)
            word_distance.append((index_to_word[j], float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[:topn]
net=net.cpu()
get_closest_word("air")


[('pedestal', 0.008338884450495243),
 ('date', 0.010459519922733307),
 ('impressive', 0.019094765186309814),
 ('enforcement', 0.019178779795765877),
 ('hearing', 0.019241103902459145)]

# **7. Implement skip-gram with negative sampling.**

In [10]:
def get_batches(context_tuple_list, batch_size=100):
    random.shuffle(context_tuple_list)
    batches = []
    batch_target, batch_context, batch_negative = [], [], []
    for i in range(len(context_tuple_list)):
        batch_target.append(word_to_index[context_tuple_list[i][0]])
        batch_context.append(word_to_index[context_tuple_list[i][1]])
        batch_negative.append([word_to_index[w] for w in context_tuple_list[i][2]])
        if (i+1) % batch_size == 0 or i == len(context_tuple_list)-1:
            tensor_target = autograd.Variable(torch.from_numpy(np.array(batch_target)).long())
            tensor_context = autograd.Variable(torch.from_numpy(np.array(batch_context)).long())
            tensor_negative = autograd.Variable(torch.from_numpy(np.array(batch_negative)).long())
            batches.append((tensor_target, tensor_context, tensor_negative))
            batch_target, batch_context, batch_negative = [], [], []
    return batches

In [24]:
from numpy.random import multinomial

def sample_negative(sample_size):
    sample_probability = {}
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    normalizing_factor = sum([v**0.75 for v in word_counts.values()])
    for word in word_counts:
        sample_probability[word] = word_counts[word]**0.75 / normalizing_factor
    words = np.array(list(word_counts.keys()))
    while True:
        word_list = []
        sampled_index = np.array(multinomial(sample_size, list(sample_probability.values())))
        for index, count in enumerate(sampled_index):
            for _ in range(count):
                 word_list.append(words[index])
        yield word_list

In [None]:
import numpy as np

context_tuple_list = []
w = 4
negative_samples = sample_negative(8)

for text in corpus:
    for i, word in enumerate(text):
        first_context_word_index = max(0,i-w)
        last_context_word_index = min(i+w, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i!=j:
                context_tuple_list.append((word, text[j], next(negative_samples)))
print("There are {} pairs of target and context words".format(len(context_tuple_list)))

In [28]:
context_tuple_list[10]

('county',
 'jury',
 ['in', 'and', 'given', 'during', 'instead', 'north', 'silk', 'bail'])

In [13]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F


class Word2Vec(nn.Module):

    def __init__(self, embedding_size, vocab_size):
        super(Word2Vec, self).__init__()
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_example):
        emb_target = self.embeddings_target(target_word)
        emb_context = self.embeddings_context(context_word)
        emb_product = torch.mul(emb_target, emb_context)
        emb_product = torch.sum(emb_product, dim=1)
        out = torch.sum(F.logsigmoid(emb_product))
        emb_negative = self.embeddings_context(negative_example)
        emb_product = torch.bmm(emb_negative, emb_target.unsqueeze(2))
        emb_product = torch.sum(emb_product, dim=1)
        out += torch.sum(F.logsigmoid(-emb_product))
        return -out

In [14]:
import time

vocabulary_size = len(vocabulary)

#loss_function = nn.CrossEntropyLoss()
net = Word2Vec(embedding_size=200, vocab_size=vocabulary_size)
net=net.cuda()
optimizer = optim.Adam(net.parameters())
early_stopping = EarlyStopping(patience=5, min_percent_gain=1)

while True:
    losses = []
    context_tuple_batches = get_batches(context_tuple_list, batch_size=2000)
    for i in range(len(context_tuple_batches)):
        net.zero_grad()
        target_tensor, context_tensor, negative_tensor = context_tuple_batches[i]
        target_tensor, context_tensor, negative_tensor = target_tensor.cuda(), context_tensor.cuda(), negative_tensor.cuda()
        loss = net(target_tensor, context_tensor, negative_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print("Loss: ", np.mean(losses))
    early_stopping.update_loss(np.mean(losses))
    if early_stopping.stop_training():
        break

Loss:  37674.032081117024
Loss:  29828.38278964428
Loss gain: 20.83%
Loss:  24712.928187333775
Loss gain: 34.4%
Loss:  20838.180414727394
Loss gain: 44.69%
Loss:  17793.955802235705
Loss gain: 52.77%
Loss:  15349.836066842587
Loss gain: 48.54%
Loss:  13357.566818691821
Loss gain: 45.95%
Loss:  11718.488663044382
Loss gain: 43.76%
Loss:  10358.081783992686
Loss gain: 41.79%
Loss:  9217.57806370512
Loss gain: 39.95%
Loss:  8249.668448200631
Loss gain: 38.24%
Loss:  7420.793245615858
Loss gain: 36.67%
Loss:  6704.4787696351395
Loss gain: 35.27%
Loss:  6079.666338202293
Loss gain: 34.04%
Loss:  5531.267810058594
Loss gain: 32.95%
Loss:  5047.107274871177
Loss gain: 31.99%
Loss:  4618.132390967836
Loss gain: 31.12%
Loss:  4235.805659387467
Loss gain: 30.33%
Loss:  3894.0056012092755
Loss gain: 29.6%
Loss:  3587.2438827189994
Loss gain: 28.92%
Loss:  3311.2305193961934
Loss gain: 28.3%
Loss:  3062.476220703125
Loss gain: 27.7%
Loss:  2837.128860798288
Loss gain: 27.14%
Loss:  2632.4145702605

In [15]:
import numpy as np

def get_closest_word(word, topn=100):
    word_distance = []
    emb = net.embeddings_target
    pdist = nn.PairwiseDistance()
    i = word_to_index[word]
    lookup_tensor_i = torch.tensor([i], dtype=torch.long)
    v_i = emb(lookup_tensor_i)
    for j in range(len(vocabulary)):
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long)
            v_j = emb(lookup_tensor_j)
            word_distance.append((index_to_word[j], float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[:topn]
net=net.cpu()
get_closest_word("county")

[('by', 13.958552360534668),
 ('and', 14.099173545837402),
 ('the', 14.320446014404297),
 ('in', 14.530234336853027),
 ('a', 14.558225631713867),
 ('of', 14.6000394821167),
 ('for', 14.89257526397705),
 ('to', 14.991289138793945),
 ('that', 15.217554092407227),
 ('was', 15.406394958496094),
 ('from', 15.501995086669922),
 ('he', 15.659364700317383),
 ('with', 15.713711738586426),
 ('under', 15.755976676940918),
 ('at', 15.783194541931152),
 ('their', 15.806936264038086),
 ('they', 15.893959045410156),
 ('were', 16.13153648376465),
 ('is', 16.161605834960938),
 ('on', 16.317657470703125),
 ('its', 16.333906173706055),
 ('have', 16.338603973388672),
 ('his', 16.423620223999023),
 ('than', 16.457183837890625),
 ('are', 16.563343048095703),
 ('would', 16.587234497070312),
 ('says', 16.665430068969727),
 ('doctor', 16.695606231689453),
 ('other', 16.72442054748535),
 ('since', 16.740633010864258),
 ('it', 16.76555633544922),
 ('i', 16.930435180664062),
 ('out', 16.932209014892578),
 ('warmt