In [1]:
import numpy as np
import torch
from collections import defaultdict
from transformers import AutoTokenizer, Trainer
import datasets
import torch.nn as nn
import torch
from datasets import load_dataset, get_dataset_split_names

from torch.utils.data import DataLoader
import torch.optim as optim

In [2]:
# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

- Train a language model on the Wikipedia (WikiText-103) dataset
- Visualize the learned embeddings for a few words
- Freeze the embeddings, train a linear model on top of them, and use it for IMDB classification

In [12]:
# WikiText-103 train / validation splits (downloaded once, then served from the local cache).
train_dataset = load_dataset("wikitext", 'wikitext-103-v1', split="train")
validation_dataset = load_dataset("wikitext", 'wikitext-103-v1', split="validation")
toy_text = "This is a sentence"

# Index the row first and the column second: `train_dataset['text'][5]` may pull the
# entire text column into memory just to read one entry. `len(dataset)` is the row count.
train_dataset[5]['text'], len(train_dataset)

Downloading and preparing dataset wikitext/wikitext-103-v1 (download: 181.42 MiB, generated: 522.23 MiB, post-processed: Unknown size, total: 703.64 MiB) to /home/ajrfhp/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126...


Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset wikitext downloaded and prepared to /home/ajrfhp/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126. Subsequent calls will reuse this data.


Reusing dataset wikitext (/home/ajrfhp/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


(" It met with positive sales in Japan , and was praised by both Japanese and western critics . After release , it received downloadable content , along with an expanded edition in November of that year . It was also adapted into manga and an original video animation series . Due to low sales of Valkyria Chronicles II , Valkyria Chronicles III was not localized , but a fan translation compatible with the game 's expanded edition was released in 2014 . Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4 . \n",
 1801350)

In [13]:
# Tokenizer of the pretrained cased BERT checkpoint; defined before `tokenize`,
# which reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize(tokens):
    """Tokenize a batch of raw text rows.

    Args:
        tokens: batch mapping whose "text" entry holds the strings to encode
            (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer's batch encoding for those strings.
    """
    # Calling the tokenizer directly is the supported entry point;
    # `batch_encode_plus` is deprecated in favour of it.
    return tokenizer(tokens["text"])

In [14]:
# Tokenize both splits with identical settings (4 worker processes, batched),
# dropping the raw "text" column once it has been encoded.
tokenized_train_dataset, tokenized_validation_dataset = (
    raw_split.map(tokenize, batched=True, num_proc=4, remove_columns=["text"])
    for raw_split in (train_dataset, validation_dataset)
)

        

#0:   0%|          | 0/451 [00:00<?, ?ba/s]

#1:   0%|          | 0/451 [00:00<?, ?ba/s]

#2:   0%|          | 0/451 [00:00<?, ?ba/s]

#3:   0%|          | 0/451 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (688 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors


       

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

In [15]:
def prepare_skipgram_context(batch_text, negative_sampling_rate=5, window_size=3, max_samples = 100000, vocab_size=None):
    """Turn a batch of token-id sequences into shuffled skip-gram training pairs.

    For every token, each neighbour within `window_size` positions yields a
    positive (centre, context) pair labelled 1, followed by up to
    `negative_sampling_rate` pairs (centre, random id) labelled 0.

    Args:
        batch_text: mapping with an 'input_ids' entry holding a list of
            token-id sequences.
        negative_sampling_rate: random ids drawn per positive pair. A draw that
            equals the centre or the true context id is discarded, so fewer
            negatives may be emitted.
        window_size: number of neighbours considered on each side of the centre.
        max_samples: soft cap; no further sentence is started once more than
            this many pairs have been collected.
        vocab_size: exclusive upper bound for random ids. Defaults to the
            global `tokenizer.vocab_size`.

    Returns:
        dict with "inputs" (int array of shape (n, 2)) and "outputs" (int array
        of shape (n,)), both shuffled with the same permutation.
    """
    if vocab_size is None:
        vocab_size = tokenizer.vocab_size

    pairs, labels = [], []
    for sentence in batch_text['input_ids']:
        if len(pairs) > max_samples:
            break
        sentence_length = len(sentence)
        for i, center in enumerate(sentence):
            # Clamp the window to the sentence; the lower bound is 0 inclusive so the
            # first token of the sentence is used as context as well.
            window_start = max(0, i - window_size)
            window_stop = min(sentence_length, i + window_size + 1)
            for j in range(window_start, window_stop):
                if j == i:
                    continue
                context = sentence[j]
                pairs.append((center, context))
                labels.append(1)
                for _ in range(negative_sampling_rate):
                    negative_sample = np.random.randint(vocab_size)
                    # Never label the centre itself or the pair just emitted as negative.
                    if negative_sample != center and negative_sample != context:
                        pairs.append((center, negative_sample))
                        labels.append(0)

    # Explicit dtype keeps an empty batch integer-typed instead of float64.
    X = np.array(pairs, dtype=np.int64).reshape((-1, 2))
    Y = np.array(labels, dtype=np.int64)
    indices = np.random.permutation(len(X))
    return { "inputs" : X[indices], "outputs" : Y[indices]}
    
    
# Expand both tokenized splits into labelled skip-gram pairs, dropping the tokenizer
# columns, then have the datasets hand back torch tensors.
skipgram_train_dataset, skipgram_validation_dataset = (
    tokenized_split.map(
        prepare_skipgram_context,
        batched=True,
        remove_columns=['input_ids', 'token_type_ids', 'attention_mask'],
    )
    for tokenized_split in (tokenized_train_dataset, tokenized_validation_dataset)
)
skipgram_train_dataset.set_format("torch")
skipgram_validation_dataset.set_format("torch")

  0%|          | 0/1802 [00:00<?, ?ba/s]

In [None]:
# Mini-batches of 2000 pairs: reshuffled every epoch for training,
# in stored order for evaluation.
train_dataloader = DataLoader(skipgram_train_dataset, batch_size=2000, shuffle=True)
eval_dataloader = DataLoader(skipgram_validation_dataset, batch_size=2000)

In [None]:
class WordToVec(nn.Module):
    """Skip-gram scorer with separate centre-word and context-word embedding tables."""

    def __init__(self, vocab_size, embedding_dim) -> None:
        super().__init__()
        # `embedding` is looked up with column 0 of the input, `context` with column 1.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.context = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

    def forward(self, x):
        """Return sigmoid(<context vector, centre vector>) for every row of `x`.

        `x` is indexed as `x[:, 0]` (centre ids) and `x[:, 1]` (context ids).
        """
        center_ids, context_ids = x[:, 0], x[:, 1]
        dot_products = (self.context(context_ids) * self.embedding(center_ids)).sum(dim=-1)
        return torch.sigmoid(dot_products)

# Smoke test: score three arbitrary (centre id, context id) pairs with an untrained model.
word_to_vec = WordToVec(vocab_size=tokenizer.vocab_size, embedding_dim=100)
random_input = torch.tensor([[10, 5], [7, 3], [9, 2]])
word_to_vec(random_input)

tensor([0.8225, 0.1322, 0.3112], grad_fn=<SigmoidBackward0>)

In [None]:
# Number of batches the training DataLoader yields per epoch.
len(train_dataloader)

In [None]:
def train(model, train_dataloader, test_dataloader, optimizer, logging_interval=100, epochs=20):
    """Train `model` with binary cross-entropy, checkpointing and evaluating every epoch.

    Args:
        model: module whose forward pass returns probabilities in [0, 1].
        train_dataloader: yields dict batches with 'inputs' and 'outputs'.
        test_dataloader: passed to `eval` at the end of every epoch.
        optimizer: optimizer already bound to `model`'s parameters.
        logging_interval: print the mean training loss every this many batches.
        epochs: number of passes over `train_dataloader`.

    Side effects:
        Writes `./word_to_vec{epoch}.pt` (the state dict of `model`) after each
        epoch and prints progress and accuracy.
    """
    # Move batches to wherever the model lives; fall back to the notebook-level
    # `device` only for a model without parameters.
    first_param = next(model.parameters(), None)
    batch_device = first_param.device if first_param is not None else device

    for epoch in range(epochs):
        # `eval` switches the model to eval mode at the end of every epoch,
        # so training mode has to be restored here, not just once up front.
        model.train()
        running_loss = 0.0
        for i, batch in enumerate(train_dataloader):
            inputs = batch['inputs'].to(batch_device)
            outputs = batch['outputs'].to(batch_device)
            # Call the module itself (not `.forward`) so registered hooks run.
            predictions = model(inputs)
            loss = nn.functional.binary_cross_entropy(predictions, outputs.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            batches_done = i + 1
            # Log after exactly `logging_interval` accumulated losses so the mean is exact.
            if batches_done % logging_interval == 0:
                running_loss /= logging_interval
                print(f'epoch {epoch}, number of batches processed {batches_done} of {len(train_dataloader)}, training_loss = {running_loss}')
                running_loss = 0.0
        # Checkpoint the model that was actually trained, not a notebook global.
        torch.save(model.state_dict(), f'./word_to_vec{epoch}.pt')
        print(f'acc epoch{epoch} {eval(model, test_dataloader)}')

def eval(model, test_dataloder, logging_interval=100, max_batches=12):
    """Estimate classification accuracy of `model` at a 0.5 probability threshold.

    Args:
        model: module whose forward pass returns probabilities in [0, 1].
        test_dataloder: iterable of dict batches with 'inputs' and 'outputs'.
        logging_interval: print the running accuracy every this many batches
            (including batch 0).
        max_batches: evaluate at most this many batches; `None` evaluates the
            whole loader. The default of 12 matches the previous fixed cut-off.

    Returns:
        Fraction of correct predictions as a float, or 0.0 when nothing was
        evaluated.

    Note:
        The model is left in eval mode when this function returns.
    """
    # Move batches to wherever the model lives; fall back to the notebook-level
    # `device` only for a model without parameters.
    first_param = next(model.parameters(), None)
    batch_device = first_param.device if first_param is not None else device

    correct, total = 0, 0
    model.eval()
    # No gradients are needed for scoring; skipping them avoids building an autograd graph.
    with torch.no_grad():
        for i, batch in enumerate(test_dataloder):
            if max_batches is not None and i >= max_batches:
                break
            inputs = batch['inputs'].to(batch_device)
            outputs = batch['outputs'].to(batch_device)
            # Probabilities strictly above 0.5 count as class 1; exactly 0.5 is class 0.
            predicted_labels = (model(inputs) > 0.5).to(outputs.dtype)
            correct += (predicted_labels == outputs).sum().item()
            total += len(outputs)
            if i % logging_interval == 0:
                print(i, correct / total if total else 0.0)
    return correct / total if total else 0.0

        

# Fresh 100-dimensional model on the selected device, optimised with Adam.
word_to_vec = WordToVec(vocab_size=tokenizer.vocab_size, embedding_dim=100)
word_to_vec = word_to_vec.to(device)
optimizer = optim.Adam(params=word_to_vec.parameters(), lr=0.001)

# Accuracy before training, the training run itself, then accuracy afterwards.
eval(word_to_vec, eval_dataloader)
train(word_to_vec, train_dataloader, eval_dataloader, optimizer)
eval(word_to_vec, eval_dataloader)

0 0.505
epoch 0, number of batches processed 100 of 3821, training_loss = 7.247525353431701
epoch 0, number of batches processed 200 of 3821, training_loss = 6.939441113471985
epoch 0, number of batches processed 300 of 3821, training_loss = 6.74136513710022
epoch 0, number of batches processed 400 of 3821, training_loss = 6.602007741928101
epoch 0, number of batches processed 500 of 3821, training_loss = 6.479649324417114
epoch 0, number of batches processed 600 of 3821, training_loss = 6.25370918750763
epoch 0, number of batches processed 700 of 3821, training_loss = 6.176643567085266
epoch 0, number of batches processed 800 of 3821, training_loss = 5.959899754524231
epoch 0, number of batches processed 900 of 3821, training_loss = 5.885436158180237
epoch 0, number of batches processed 1000 of 3821, training_loss = 5.7955278968811035
epoch 0, number of batches processed 1100 of 3821, training_loss = 5.495901441574096
epoch 0, number of batches processed 1200 of 3821, training_loss = 

KeyboardInterrupt: 

In [10]:
def get_vector(word):
    """Return the learned embedding of `word` as a 1-D tensor.

    The word is encoded without special tokens, so every id belongs to the word
    itself. A word that splits into several word pieces is represented by the
    mean of its piece vectors (previously only the first piece was used); for a
    single-piece word the result is that piece's vector.

    Args:
        word: text to look up in `word_to_vec.embedding`.

    Returns:
        Tensor of shape (embedding_dim,) on the embedding table's device.

    Raises:
        ValueError: if `word` produces no tokens (e.g. an empty string).
    """
    embedding_table = word_to_vec.embedding
    piece_ids = tokenizer(word, add_special_tokens=False)['input_ids']
    if not piece_ids:
        raise ValueError(f"{word!r} produced no tokens")
    # Build the ids on the table's own device so the lookup cannot hit a device mismatch.
    ids = torch.tensor(piece_ids, device=embedding_table.weight.device)
    return embedding_table(ids).mean(dim=0)

def get_vector_similarity(a, b):
    """Cosine similarity of two 1-D tensors.

    Args:
        a, b: 1-D tensors of equal length. Integer tensors are promoted to the
            default floating dtype first.

    Returns:
        0-dim tensor in [-1, 1]. If either vector is all zeros the result is 0
        rather than NaN.
    """
    # Promote integer inputs explicitly instead of relying on implicit promotion in sqrt.
    if not a.is_floating_point():
        a = a.to(torch.get_default_dtype())
    if not b.is_floating_point():
        b = b.to(torch.get_default_dtype())
    num = a.dot(b)
    den = torch.sqrt((a * a).sum() * (b * b).sum())
    # Clamp so a zero-norm vector gives 0 / eps = 0 instead of 0 / 0 = NaN.
    return num / torch.clamp(den, min=1e-8)
    
# Compare "computer" against a related word and an unrelated word.
(
    get_vector_similarity(get_vector('computer'), get_vector('information')),
    get_vector_similarity(get_vector('computer'), get_vector('football')),
)

(tensor(0.1522, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(0.1385, device='cuda:0', grad_fn=<DivBackward0>))

In [None]:
# Sanity check: the two vectors point in exactly opposite directions, so the cosine similarity should be -1.
get_vector_similarity(torch.tensor([1, 2, 3]), torch.tensor([-1, -2, -3]))