In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# 1. Data

In [2]:
# 10 documents, each having 30 words
corpus = ["throughout history governments from around the world have undergone various modifications with the goal of improving capacity, efficiency, and citizen services. with the creation of cutting-edge technology, this progress was", 
          "one of the technologies that has shown to be extremely important for our daily life in all facets of the digital society is the internet of things (iot). iot is",
          "regarded as the foundation of smart cities, just as smart cities serve as the foundation of smart governments. this study explores the applications of iot in smart government along with",
          "by considering smart government as an extension of e-government many countries have started to invest in this domain. dubai, australia, singapore and moldova have already taken some initiatives and found ",
          "significant result of smart government (alonaizi & manuel, 2021). the phrase  indicates the update to e-government that enables users to quickly get government services using smart technologies by ",
          "interacting  with them in new ways as a result of the popularity of mobile applications, social media, and other smart devices (algebri et al., 2018). the cornerstone of creating public",
          "the relationship via which a government extends its value chain to people. the basis for the working definition utilized in this study is moore's thesis that public value incorporates",
          "real-time communication between individuals and the public sectors is a key indicator of how valuable e-government services are to the general public (karunasena & deng, 2012). socio cultural, economical, financial",
          "the word iot refers to a collection of real-world items that have sensors built in and are networked together to provide useful data. definitions of iot have evolved in line",
          "the bulk of iot efforts have been implemented in the commercial sectors, however researches have shown that iot deployment in the public sector has significantly increased"
         ]

In [3]:
corpus = [sent.split(" ") for sent in corpus]
#corpus

In [5]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
#vocab

In [6]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
#print(word2index)

In [7]:
#vocab size
voc_size = len(vocab)
print(voc_size)

177


In [8]:
#append UNK
vocab.append('<UNK>')

In [9]:
word2index['<UNK>'] = 177

In [10]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

# 2. Prepare Train Data

In [138]:
for c in corpus:
    print(c)

['throughout', 'history', 'governments', 'from', 'around', 'the', 'world', 'have', 'undergone', 'various', 'modifications', 'with', 'the', 'goal', 'of', 'improving', 'capacity,', 'efficiency,', 'and', 'citizen', 'services.', 'with', 'the', 'creation', 'of', 'cutting-edge', 'technology,', 'this', 'progress', 'was']
['one', 'of', 'the', 'technologies', 'that', 'has', 'shown', 'to', 'be', 'extremely', 'important', 'for', 'our', 'daily', 'life', 'in', 'all', 'facets', 'of', 'the', 'digital', 'society', 'is', 'the', 'internet', 'of', 'things', '(iot).', 'iot', 'is']
['regarded', 'as', 'the', 'foundation', 'of', 'smart', 'cities,', 'just', 'as', 'smart', 'cities', 'serve', 'as', 'the', 'foundation', 'of', 'smart', 'governments.', 'this', 'study', 'explores', 'the', 'applications', 'of', 'iot', 'in', 'smart', 'government', 'along', 'with']
['by', 'considering', 'smart', 'government', 'as', 'an', 'extension', 'of', 'e-government', 'many', 'countries', 'have', 'started', 'to', 'invest', 'in', '

In [139]:
#word2index
skipgrams = []

#for each corpus
for sent in corpus:
    for i in range(2, len(sent) - 2): 
        center_word = sent[i]
        outside_words = [sent[i-2], sent[i-1], sent[i+1], sent[i+2]]  #window_size = 2
        for o in outside_words:
            skipgrams.append([center_word, o])

#skipgrams

In [11]:
def random_batch(batch_size, word_sequence):
    
    # Make skip gram of window size 2
    skip_grams = []
    # loop each word sequence
    for sent in corpus:
        for i in range(2, len(sent) - 2):
            target = word2index[sent[i]]
            context = [word2index[sent[i - 2]], word2index[sent[i - 1]], word2index[sent[i + 1]], word2index[sent[i + 2]]] # window size 2
            for w in context:
                skip_grams.append([target, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams[i][1]])  # context word, e.g., 3
            
    return np.array(random_inputs), np.array(random_labels)

## Testing the method

In [12]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch = random_batch(batch_size, corpus)

print("Input: ", input_batch)
print("Target: ", target_batch)

Input:  [[102]
 [130]]
Target:  [[ 11]
 [144]]


# 3. Model

In [13]:
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, target_words, all_vocabs):
        center_embeds = self.embedding_v(center_words)# [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        all_embeds    = self.embedding_v(all_vocabs) #   [batch_size, voc_size, emb_size]
        # print(all_embeds.shape)
        
        scores      = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, voc_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, voc_size, 1] = [batch_size, voc_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

# 4. Training

In [14]:
batch_size     = 2 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Skipgram(len(vocab), embedding_size)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
model

Skipgram(
  (embedding_v): Embedding(178, 2)
  (embedding_u): Embedding(178, 2)
)

In [16]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#use for the normalized term in the probability calculation
all_vocabs = prepare_sequence(list(vocab), word2index).expand(batch_size, len(vocab))  # [batch_size, voc_size]
all_vocabs.shape

torch.Size([2, 178])

In [17]:
#all_vocabs

In [18]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [20]:
import time

# Training
num_epochs = 10000
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch = random_batch(batch_size, corpus)
    input_batch  = torch.LongTensor(input_batch)  
    target_batch = torch.LongTensor(target_batch) 
    # print(input_batch.shape)
    # print(target_batch.shape)

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 1000 | cost: 5.877048 | time: 0m 0s
Epoch: 2000 | cost: 4.822315 | time: 0m 0s
Epoch: 3000 | cost: 5.309200 | time: 0m 0s
Epoch: 4000 | cost: 5.415352 | time: 0m 0s
Epoch: 5000 | cost: 4.698609 | time: 0m 0s
Epoch: 6000 | cost: 5.542280 | time: 0m 0s
Epoch: 7000 | cost: 4.896063 | time: 0m 0s
Epoch: 8000 | cost: 5.807316 | time: 0m 0s
Epoch: 9000 | cost: 0.938382 | time: 0m 0s
Epoch: 10000 | cost: 4.497680 | time: 0m 0s


# Negative Sampling

## Unigram distribution

In [21]:
Z = 0.001

from collections import Counter

word_count = Counter(flatten(corpus))
num_total_words = sum([c for w, c in word_count.items()])

In [24]:
word_count['the']

22

In [25]:
num_total_words 

297

In [26]:
unigram_table = []

for vo in vocab:
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))

In [27]:
Counter(unigram_table)

Counter({'': 39,
         '2018).': 13,
         'manuel,': 13,
         'cornerstone': 13,
         'foundation': 23,
         'governments.': 13,
         'commercial': 13,
         'provide': 13,
         'society': 13,
         'considering': 13,
         'governments': 13,
         'have': 60,
         'invest': 13,
         'new': 13,
         '(algebri': 13,
         '2012).': 13,
         'financial': 13,
         'individuals': 13,
         'definitions': 13,
         'progress': 13,
         'shown': 23,
         'general': 13,
         'key': 13,
         'deng,': 13,
         'that': 46,
         'around': 13,
         'socio': 13,
         '&': 23,
         'a': 39,
         'result': 23,
         'researches': 13,
         'built': 13,
         'quickly': 13,
         'sectors,': 13,
         'how': 13,
         'been': 13,
         'things': 13,
         'is': 39,
         'moldova': 13,
         'found': 13,
         'initiatives': 13,
         'refers': 13,
         'g

## Negative Sampling

In [28]:
import random

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].item()
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    
    return torch.cat(neg_samples)

### Testing

In [33]:
# input_batch  = torch.Tensor(input_batch)
# target_batch = torch.LongTensor(target_batch)

In [30]:
type(input_batch)

torch.Tensor

In [31]:
target_batch.shape

torch.Size([2, 1])

In [32]:
num_neg = 3
negative_sampling(target_batch, unigram_table, num_neg)

tensor([[138,  98, 128],
        [149, 102,  37]])

## Model

In [34]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, emb_size) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                    
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        neg_embeds    = -self.embedding_u(negative_words) # [batch_size, num_neg, emb_size]
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        negative_score = neg_embeds.bmm(center_embeds.transpose(1, 2))
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, k, 1]
        
        loss = self.logsigmoid(positive_score) + torch.sum(self.logsigmoid(negative_score), 1)
                
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

## Training

In [37]:
len(vocab)

178

In [38]:
batch_size     = 2 # mini-batch size
embedding_size = 2 #so we can later plot
model          = SkipgramNegSampling(len(vocab), embedding_size)
num_neg        = 10 # num of negative sampling

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [39]:
model

SkipgramNegSampling(
  (embedding_v): Embedding(178, 2)
  (embedding_u): Embedding(178, 2)
  (logsigmoid): LogSigmoid()
)

In [40]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [41]:
import time

# Training
num_epochs = 10000
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch = random_batch(batch_size, corpus)
    
    #input_batch: [batch_size, 1]
    input_batch = torch.LongTensor(input_batch)
    
    #target_batch: [batch_size, 1]
    target_batch = torch.LongTensor(target_batch)
    
    #negs_batch:   [batch_size, num_neg]
    negs_batch = negative_sampling(target_batch, unigram_table, num_neg)
    
    optimizer.zero_grad()
        
    loss = model(input_batch, target_batch, negs_batch)
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

Epoch: 1000 | cost: 10.828545 | time: 0m 0s
Epoch: 2000 | cost: 6.657749 | time: 0m 0s
Epoch: 3000 | cost: 10.185398 | time: 0m 0s
Epoch: 4000 | cost: 7.603157 | time: 0m 0s
Epoch: 5000 | cost: 7.236413 | time: 0m 0s
Epoch: 6000 | cost: 7.895445 | time: 0m 0s
Epoch: 7000 | cost: 7.785011 | time: 0m 0s
Epoch: 8000 | cost: 7.051541 | time: 0m 0s
Epoch: 9000 | cost: 7.138447 | time: 0m 0s
Epoch: 10000 | cost: 6.980354 | time: 0m 0s
