<a href="https://colab.research.google.com/github/ariahosseini/DeepML/blob/main/030_PyTorch_Proj_Thirty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# utils
import os, sys, collections, time
import math, random
import zipfile
import numpy as np
from pathlib import Path
# sklearn
from sklearn.feature_extraction.text import CountVectorizer
# torch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

## Word2Vec

In [None]:
# Create a local data directory, make it the working directory (%cd persists
# for later cells), and download the Penn Treebank training split into it.
!mkdir data
%cd data
!wget https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt

/content/data
--2024-05-20 17:35:54--  https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5101618 (4.9M) [text/plain]
Saving to: ‘ptb.train.txt’


2024-05-20 17:35:55 (60.1 MB/s) - ‘ptb.train.txt’ saved [5101618/5101618]



In [None]:
# Location of the downloaded Penn Treebank training file.
ROOT_DIR = '/content'
data_path = os.path.join(ROOT_DIR, 'data/')
file = 'ptb.train.txt'

# Read the file line by line; each line is one sentence.
with open(data_path + file, 'r') as f:
    lines = list(f)

# Whitespace-tokenise every sentence.
raw_dataset = [line.split() for line in lines]
'# sentences: %d' % len(raw_dataset)

'# sentences: 42068'

In [None]:
# Peek at the first raw line read from the file.
lines[0]

' aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter \n'

In [None]:
# Show the token count and the first five tokens of the first three sentences.
for st in raw_dataset[:3]:
    print(len(st), st[:5])

24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
15 ['pierre', '<unk>', 'N', 'years', 'old']
11 ['mr.', '<unk>', 'is', 'chairman', 'of']


In [None]:
# Token frequencies over the whole corpus ...
counter = collections.Counter(tk for st in raw_dataset for tk in st)
# ... restricted to tokens that occur at least 5 times (plain dict, insertion order kept).
counter = {tk: freq for tk, freq in counter.items() if freq >= 5}

In [None]:
# Spot-check the frequencies of three tokens that passed the minimum-count filter.
counter['the'], counter['N'], counter['<unk>']

(50770, 32481, 45020)

In [None]:
# Vocabulary: index -> token follows the insertion order of `counter`.
idx_to_token = list(counter)
# Inverse mapping: token -> index.
token_to_idx = dict(zip(idx_to_token, range(len(idx_to_token))))
# Encode every sentence as a list of indices; tokens outside the vocabulary are dropped.
dataset = [[idx for idx in map(token_to_idx.get, st) if idx is not None]
           for st in raw_dataset]
num_tokens = sum(len(st) for st in dataset)
'# tokens: %d' % num_tokens

'# tokens: 887100'

In [None]:
# Peek at the first five vocabulary entries and look up the indices of two tokens.
idx_to_token[:5], token_to_idx['consensus'], token_to_idx['pierre']

(['pierre', '<unk>', 'N', 'years', 'old'], 4827, 0)

In [None]:
# First three sentences as index lists (tokens outside the vocabulary were dropped).
dataset[:3]

[[],
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 2],
 [14, 1, 15, 16, 17, 1, 18, 7, 19, 20, 21]]

In [None]:
def discard(idx):
    """Randomly decide whether one occurrence of token `idx` is dropped.

    The token is dropped with probability 1 - sqrt(1e-4 * num_tokens / count),
    where `count` is its corpus frequency taken from `counter`. When that
    expression is not positive the token is never dropped.

    Args:
        idx: vocabulary index of the token.

    Returns:
        True if this occurrence should be discarded.
    """
    token_count = counter[idx_to_token[idx]]
    keep_ratio = math.sqrt(1e-4 / token_count * num_tokens)
    return random.uniform(0, 1) < 1 - keep_ratio

# Subsample frequent tokens sentence by sentence.
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
'# tokens: %d' % sum(len(st) for st in subsampled_dataset)

'# tokens: 375975'

In [None]:
def compare_counts(token):
    """Report how often `token` occurs before and after subsampling.

    Args:
        token: a token string present in `token_to_idx`.

    Returns:
        A formatted string with the occurrence count in `dataset` and in
        `subsampled_dataset`.
    """
    token_id = token_to_idx[token]
    before = sum(st.count(token_id) for st in dataset)
    after = sum(st.count(token_id) for st in subsampled_dataset)
    return '# of "%s": before=%d, after=%d' % (token, before, after)

compare_counts('the'), compare_counts('join')

('# of "the": before=50770, after=2121', '# of "join": before=45, after=45')

In [None]:
def get_centers_and_contexts(dataset, max_window_size):
    """Extract skip-gram (center, context-list) pairs from indexed sentences.

    Every token of every sentence with at least two tokens becomes a center
    word. For each center a window radius is drawn uniformly from
    1..max_window_size, and the tokens within that radius (clipped to the
    sentence, center excluded) form its contexts.

    Args:
        dataset: iterable of sentences, each an indexable sequence of token ids.
        max_window_size: largest window radius that may be drawn.

    Returns:
        (centers, contexts): `centers` is a flat list of token ids and
        `contexts[i]` is the list of context token ids for `centers[i]`.
    """
    centers, contexts = [], []
    for sentence in dataset:
        n_tokens = len(sentence)
        # A sentence needs at least two tokens to yield a (center, context) pair.
        if n_tokens < 2:
            continue
        centers.extend(sentence)
        for pos in range(n_tokens):
            radius = random.randint(1, max_window_size)
            first = max(0, pos - radius)
            stop = min(n_tokens, pos + radius + 1)
            contexts.append([sentence[j] for j in range(first, stop) if j != pos])
    return centers, contexts

In [None]:
# Two toy sentences (9 and 3 tokens) to illustrate the extraction.
tiny_dataset = [[0, 2, 1, 2, 3, 5, 4, 5, 6], [7, 8, 9]]
print('dataset', tiny_dataset)
centers_and_contexts = get_centers_and_contexts(tiny_dataset, 3)
for center, context in zip(*centers_and_contexts):
    print('center', center, 'has contexts', context)

dataset [[0, 2, 1, 2, 3, 5, 4, 5, 6], [7, 8, 9]]
center 0 has contexts [2, 1, 2]
center 2 has contexts [0, 1, 2]
center 1 has contexts [0, 2, 2, 3]
center 2 has contexts [0, 2, 1, 3, 5, 4]
center 3 has contexts [1, 2, 5, 4]
center 5 has contexts [1, 2, 3, 4, 5, 6]
center 4 has contexts [2, 3, 5, 5, 6]
center 5 has contexts [5, 4, 6]
center 6 has contexts [5, 4, 5]
center 7 has contexts [8, 9]
center 8 has contexts [7, 9]
center 9 has contexts [7, 8]


In [None]:
# Build (center, contexts) pairs from the subsampled corpus with a maximum window size of 5.
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

In [None]:
def get_negatives(all_contexts, sampling_weights, K):
    """Draw noise (negative) words for every center word.

    For each context list, `len(contexts) * K` token indices are sampled with
    replacement according to `sampling_weights`; candidates that appear in the
    context list are rejected.

    Args:
        all_contexts: list of context lists (token indices), one per center word.
        sampling_weights: relative sampling weight of every token index;
            position i is the weight of token index i.
        K: number of noise words to draw per context word.

    Returns:
        A list parallel to `all_contexts`; entry i holds the noise token
        indices for `all_contexts[i]`.

    Note:
        Sampling is by rejection, so the loop for a context list does not
        terminate if every token with positive weight is in that list.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the exclusion set and the target size once per center word
        # instead of once per candidate.
        excluded = set(contexts)
        n_needed = len(contexts) * K
        negatives = []
        while len(negatives) < n_needed:
            if i == len(neg_candidates):
                # Candidates are drawn in large batches because one weighted
                # draw of 1e5 samples is much cheaper than 1e5 single draws.
                i, neg_candidates = 0, random.choices(population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            if neg not in excluded:  # noise words cannot be context words
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

# Noise distribution: each token's count raised to the power 0.75, in vocabulary-index order.
sampling_weights = [counter[w]**0.75 for w in idx_to_token]
# Draw 5 noise words per context word.
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

In [None]:
# Inspect the noise words and the context words of the first center word.
all_negatives[0], all_contexts[0]

([84, 83, 108, 7972, 628, 1803, 264, 7554, 3889, 1366], [3, 4])

In [None]:
class PTB_dataset(Dataset):
    """Skip-gram training examples, padded to a common length and stored as tensors.

    Example i consists of the center word, its context words followed by its
    noise words (zero-padded), a mask marking the non-padding positions, and
    labels marking which positions are context words.
    """

    def __init__(self, all_centers, all_contexts, all_negatives):
        """Pad and tensorise all examples up front.

        Args:
            all_centers: list of center token indices.
            all_contexts: list of context-index lists, parallel to `all_centers`.
            all_negatives: list of noise-index lists, parallel to `all_centers`.
        """
        examples = list(zip(all_centers, all_contexts, all_negatives))
        (self.all_centers,
         self.all_contexts_negatives,
         self.all_masks,
         self.all_labels) = self.batchify(examples)

    def __len__(self):
        """Number of examples (one per center word)."""
        return len(self.all_centers)

    def __getitem__(self, idx):
        """Return (center, contexts_negatives, mask, label) for example `idx`."""
        return (self.all_centers[idx],
                self.all_contexts_negatives[idx],
                self.all_masks[idx],
                self.all_labels[idx])

    def batchify(self, data):
        """Pad every example to the longest contexts+negatives length in `data`.

        Args:
            data: list of (center, context_list, negative_list) triples.

        Returns:
            Four tensors: centers of shape (N, 1), and contexts_negatives,
            masks and labels, each of shape (N, max_len).
        """
        max_len = max(len(context) + len(negative) for _, context, negative in data)
        centers = [center for center, _, _ in data]
        contexts_negatives, masks, labels = [], [], []
        for _, context, negative in data:
            n_context = len(context)
            n_real = n_context + len(negative)
            n_pad = max_len - n_real
            # Token ids: contexts first, then noise words, then zero padding.
            contexts_negatives.append(context + negative + [0] * n_pad)
            # Mask: 1 for real entries, 0 for padding.
            masks.append([1] * n_real + [0] * n_pad)
            # Labels: 1 for context words, 0 for noise words and padding.
            labels.append([1] * n_context + [0] * (max_len - n_context))
        return (torch.tensor(centers).view((-1, 1)),
                torch.tensor(np.array(contexts_negatives)),
                torch.tensor(np.array(masks)),
                torch.tensor(np.array(labels)))

In [None]:
# Pad and tensorise all examples, then look at the second example
# (center, contexts+negatives, mask, labels).
ptbdata = PTB_dataset(all_centers, all_contexts, all_negatives)
ptbdata[1]

(tensor([3]),
 tensor([   0,    4,    5,    6,   11,   12,  885,  662,   14, 3469,    7, 1336,
          376,  336, 6799,  353,  286,  519,  299, 1729,   74, 2155,  317, 5695,
         2599,  226, 4261, 3099, 1908, 3891,  649, 6650, 5348, 6280, 4711, 1527,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [None]:
batch_size = 512
# Shuffled mini-batches, loaded by four worker processes.
data_loader = DataLoader(ptbdata, batch_size=batch_size, shuffle=True, num_workers=4)

# Print the tensor shapes of the first mini-batch only.
tensor_names = ('centers', 'contexts_negatives', 'masks', 'labels')
for batch in data_loader:
    for position, name in enumerate(tensor_names):
        print(name, 'shape:', batch[position].shape)
    break



centers shape: torch.Size([512, 1])
contexts_negatives shape: torch.Size([512, 60])
masks shape: torch.Size([512, 60])
labels shape: torch.Size([512, 60])


In [None]:
class ScaledEmbedding(nn.Embedding):
    """
    Embedding layer whose weights are initialised from a normal
    distribution with mean 0 and standard deviation equal to the
    inverse of the embedding dimension.
    """
    def reset_parameters(self):
        """
        Initialise the weights; the padding row, if any, is set to zero.
        """
        std = 1.0 / self.embedding_dim
        with torch.no_grad():
            self.weight.normal_(0, std)
            if self.padding_idx is not None:
                self.weight[self.padding_idx].fill_(0)


class Skip_gram(nn.Module):
    """
    Skip-gram model: one embedding table for center words and a separate
    one for context/noise words; scores are their dot products.
    """
    def __init__(self, input_dim, embed_size = 100):
        """
        Args:
            input_dim: number of rows in each embedding table (vocabulary size).
            embed_size: dimensionality of the embedding vectors.
        """
        super().__init__()
        self.input_dim, self.embed_size = input_dim, embed_size
        # Creation order is kept (central first, then context) so that
        # parameter order and random initialisation are unchanged.
        self.central_emb = ScaledEmbedding(input_dim, embed_size)
        self.context_emb = ScaledEmbedding(input_dim, embed_size)

    def forward(self, icent, icont):
        """
        Score every center word against every candidate word of its example.

        Args:
            icent: integer index tensor of shape (b, i) with center-word indices.
            icont: integer index tensor of shape (b, k) with candidate-word indices.

        Returns:
            Tensor of shape (b, i, k) holding the dot product of each center
            embedding with each candidate embedding.
        """
        center_vectors = self.central_emb(icent)
        candidate_vectors = self.context_emb(icont)
        return torch.einsum('bij,bkj -> bik', center_vectors, candidate_vectors)

In [None]:
# One embedding row per vocabulary token; embed_size keeps its default of 100.
net = Skip_gram(len(idx_to_token))

In [None]:
# Smoke test: three center indices (shape (3, 1)) scored against two candidate indices each.
net(torch.tensor([0,1,3]).unsqueeze(1),torch.tensor([[0,0],[0,0],[0,0]]))

tensor([[[ 5.8173e-05,  5.8173e-05]],

        [[-8.0108e-04, -8.0108e-04]],

        [[-1.4879e-04, -1.4879e-04]]], grad_fn=<ViewBackward0>)

In [None]:
# Element-wise sigmoid + binary cross-entropy; the reduction is done in `criterion`.
loss_fn = nn.BCEWithLogitsLoss(reduction='none')

def criterion(pred, label, mask):
    """Masked binary cross-entropy, averaged per example.

    Args:
        pred: logits, shape (batch, length).
        label: float targets with the same shape as `pred`.
        mask: same shape as `pred`; positions with 0 do not contribute.

    Returns:
        Tensor of shape (batch,): for each row, the sum of the masked
        element-wise losses divided by the sum of the mask.
    """
    elementwise = loss_fn(pred, label)
    masked_total = (elementwise * mask).sum(1)
    n_valid = mask.sum(1)
    return masked_total / n_valid

In [None]:
# Worked example: two rows of four logits each.
pred = torch.tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]])
# In `label`, 1 marks a context word and 0 marks a noise word
label = torch.tensor([[1, 0, 0, 0], [1, 1, 0, 0]]).type(torch.FloatTensor)
mask = torch.tensor([[1, 1, 1, 1], [1, 1, 1, 0]]).type(torch.FloatTensor)  # 0 excludes a position from the loss
criterion(pred, label, mask)

tensor([0.8740, 1.2100])

In [None]:
# Adam over all parameters of the model (both embedding tables), learning rate 0.005.
optimizer = torch.optim.Adam(net.parameters(), lr=0.005)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = net.to(device)

In [None]:
def train(n_epochs):
    """Train the skip-gram model `net` with negative sampling.

    Iterates over `data_loader` for `n_epochs` epochs, optimising the masked
    binary cross-entropy from `criterion` with `optimizer`, and prints the
    summed mini-batch loss and the wall-clock time of every epoch.

    Args:
        n_epochs: number of passes over `data_loader`.
    """
    for epoch in range(n_epochs):
        start, loss = time.time(), 0
        for batch in data_loader:
            cent, cont, mas, lab = batch
            cent = cent.to(device)
            cont = cont.to(device)
            mas = mas.to(device)
            # BCEWithLogitsLoss needs floating-point targets.
            lab = lab.type(torch.FloatTensor).to(device)
            # Remove only the singleton center axis: (batch, 1, len) -> (batch, len).
            # A bare squeeze() would also remove the batch axis when the last
            # mini-batch contains a single example, breaking the loss shapes.
            pred = net(cent, cont).squeeze(1)
            optimizer.zero_grad()
            curr_loss = criterion(pred, lab, mas).mean()
            curr_loss.backward()
            optimizer.step()
            # Accumulate the per-batch mean loss; the epoch figure is their sum.
            loss += curr_loss.item()
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, loss, time.time() - start))

In [None]:
# Train for 6 epochs.
train(6)

  self.pid = os.fork()
  self.pid = os.fork()


epoch 1, loss 334.34, time 40.09s
epoch 2, loss 290.13, time 33.05s
epoch 3, loss 259.21, time 33.23s
epoch 4, loss 237.44, time 35.43s
epoch 5, loss 224.14, time 35.12s
epoch 6, loss 215.36, time 34.01s


In [None]:
def get_similar_tokens(query_token, k, W):
    """Print the `k` tokens whose embeddings are most cosine-similar to the query.

    Args:
        query_token: token string present in `token_to_idx`.
        k: number of neighbours to print.
        W: 2-D embedding matrix with one row per vocabulary index.
    """
    query_vec = W[token_to_idx[query_token]]
    dot_products = torch.matmul(W, query_vec)
    # The small constant inside the square root avoids division by zero.
    norm_products = torch.sqrt(torch.sum(W * W, 1) * torch.sum(query_vec * query_vec) + 1e-9)
    cos = dot_products / norm_products
    topk = torch.topk(cos, k=k + 1)[1]
    # The top hit is skipped: it is expected to be the query token itself.
    for i in topk[1:]:
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

get_similar_tokens('chip', 3, net.central_emb.weight.data)

cosine sim=0.608: intel
cosine sim=0.528: chips
cosine sim=0.499: workstation
