# LLM - HW2 : Word2Vec Model Notebook - **Alexandre NGAU** *(November 8, 2023)*

# Environment setting

In [1]:
# ========== installing packages ==========

%%capture
!pip install transformers datasets

In [2]:
# ========== importing libraries ==========

import math
import os
import random
from pprint import pprint

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from tabulate import tabulate
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import BertTokenizer

# ========== selecting the compute device (GPU when available, CPU otherwise) ==========

if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

cpu


# Setting some global variables (1 of 2)

In [3]:
R = 4 # radius: number of context tokens taken on each side of a word (each context vector has 2*R ids)
K = 6 # ratio: number of negative context ids drawn per positive context id (2*K*R negatives per word)
batch_size = 64 # number of (word, contexts) samples per batch
n_epochs = 10 # number of passes over the training set per call to training()

The first cells are the same as those of the lab on text convolution.

# Data loading


In [4]:
# Load the "train" split of the scikit-learn/imdb dataset and show its schema
dataset = load_dataset("scikit-learn/imdb", split="train")
print(dataset)

Downloading readme:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/66.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})


# Pre-processing / Tokenization

This is a very important step. It may be boring but very important. In this session we will be lazy, but in real life, the time spent on inspecting and cleaning data is never wasted. It is true for text, but also for everything.



In PyTorch, everything is a tensor. Words are replaced by indices. A sentence is therefore a sequence of indices (long integers). In the first HW, you constructed a `WhiteSpaceTokenizer`. Here we will use an already built tokenizer, which is better suited to transformers. It relies on sub-word units and converts everything to lower case. This is not always the best choice, but here it will be sufficient. To quote the documentation, this tokenizer allows you to:
- Tokenize (splitting strings in sub-word token strings), convert tokens strings to ids and back, and encoding/decoding (i.e., tokenizing and converting to integers).
- Add new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece…).
- Manage special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization.

Here we are going to use the tokenizer from the well known Bert model, that we can directly download.

In [5]:
# Pretrained "bert-base-uncased" sub-word tokenizer; do_lower_case=True lower-cases the text before tokenizing
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
def preprocessing_fn(x, tokenizer):
    """Tokenize one review and encode its sentiment as a binary label.

    Args:
        x: a mapping with a "review" text and a "sentiment" string; it is
            updated in place.
        tokenizer: a callable tokenizer returning a mapping with an
            "input_ids" entry.

    Returns:
        The same ``x`` with two extra entries: "review_ids" (token ids,
        truncated to 256 tokens, without special tokens or padding) and
        "label" (0 when the sentiment is "negative", 1 otherwise).
    """
    encoding = tokenizer(
        x["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=256,
        padding=False,
        return_attention_mask=False,
    )
    x["review_ids"] = encoding["input_ids"]

    if x["sentiment"] == "negative":
        x["label"] = 0
    else:
        x["label"] = 1
    return x

Same cell as in the lab session.

🚧 **TODO** 🚧

Read the documentation about HuggingFace dataset and complete the code below.
You should:
- Shuffle the dataset
- For computational reasons, use only a total of **5000 samples**.
- Tokenize the dataset with the `preprocessing_fn`. (*Hint: use the `Dataset.map` method from HuggingFace*).
- Keep only columns `review_ids` and `label`.
- Make a train/validation split, (**80% / 20%**). Call these dataset `train_set` and `valid_set`.


In [7]:
n_samples = 5000  # total number of reviews kept (train + validation)
SEED = 42  # fixed seed so that the sampled subset and the split are reproducible

# Shuffle first so that the selected subset is a random sample of the corpus.
# The result gets its own name so that re-running this cell does not re-shuffle `dataset`.
shuffled_dataset = dataset.shuffle(seed=SEED)

# Select 5000 samples
split_dataset = shuffled_dataset.select(range(n_samples))

# Tokenize, keep only the useful columns, then make the 80% / 20% train/validation split
tok_dataset = (
    split_dataset
    .map(preprocessing_fn, fn_kwargs={"tokenizer": tokenizer})
    .select_columns(["review_ids", "label"])
    .train_test_split(test_size=0.2, seed=SEED)
)

# ========== splitting the train/validation dataset ==========

document_train_set = tok_dataset["train"]
document_valid_set = tok_dataset["test"]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [8]:
# Number of reviews in the train and validation splits
print(len(document_train_set), len(document_valid_set))

4000 1000


In [9]:
# Inspect the first tokenized training example (its label and its list of token ids)
pprint(document_train_set[0])

{'label': 1,
 'review_ids': [1045,
                2318,
                3666,
                1996,
                2265,
                2013,
                1996,
                2034,
                2161,
                1010,
                1998,
                2012,
                1996,
                2927,
                1045,
                2001,
                3492,
                18386,
                2055,
                2009,
                1012,
                2434,
                3185,
                2001,
                2785,
                1997,
                24282,
                1010,
                1998,
                1045,
                2001,
                2074,
                2559,
                2005,
                2070,
                16596,
                1011,
                10882,
                2265,
                2096,
                3403,
                2005,
                1996,
                18667,
              

# `extract_words_contexts` function

In [10]:
def extract_words_contexts(list_of_ids_of_txt_doc, R=R):
  """Build the positive context of every token of one document.

  Args:
    list_of_ids_of_txt_doc: sequence of token ids of a single document.
    R: context radius, i.e. number of neighbours taken on each side.

  Returns:
    A pair ``(word_ids, contexts)`` where ``word_ids`` is the input sequence
    itself and ``contexts[i]`` lists the 2*R ids surrounding position ``i``
    (the token at position ``i`` excluded). Positions falling outside the
    document are filled with 0 so every context has the same length.
  """
  token_ids = list_of_ids_of_txt_doc
  n_tokens = len(token_ids)
  contexts = [
      [
          token_ids[pos] if 0 <= pos < n_tokens else 0  # 0 pads out-of-range positions
          for pos in range(center - R, center + R + 1)
          if pos != center  # the word itself is not part of its context
      ]
      for center in range(n_tokens)
  ]
  return token_ids, contexts

In the Bert Tokenizer, the padding index is 0. That is why, in order to handle the borders, the positive context vectors are padded with zeros. This way, all positive context vectors have the same size.


In the cell below, the length of every positive context vector is verified to be exactly the same.

In [11]:
# Every context vector of the first training document must have the same length
w_test, c_test = extract_words_contexts(document_train_set[0]["review_ids"])
etalon = len(c_test[0])  # reference length: the first context vector
assert all(len(context) == etalon for context in c_test)

# `flatten_dataset_to_list` function

The idea for this function is to concatenate the outputs of the `extract_words_contexts` function, applied at every document/text that is in the dataset. The `flatten_dataset_to_list` function will then output two lists of the same length : one of word ids, the other being a list of positive context vectors associated to each word.

In [12]:
def flatten_dataset_to_list(data_set, R=R):
  """Concatenate the words and positive contexts of every document of a dataset.

  Args:
    data_set: indexable collection whose items expose the token ids of a
      document under the "review_ids" key.
    R: context radius forwarded to ``extract_words_contexts``.

  Returns:
    A pair ``(W, C)`` of equally long lists: ``W`` holds every word id of the
    dataset in reading order and ``C[i]`` is the positive context of ``W[i]``.
  """
  W = []
  C = []
  for i in range(len(data_set)):
    # Forward R explicitly: otherwise the callee silently falls back to its own default
    w, c = extract_words_contexts(data_set[i]["review_ids"], R=R)
    W.extend(w)
    C.extend(c)
  return W, C

In [13]:
# ========== applying flatten_dataset_to_list to the train/validation datasets ==========
# Each result is a (word_ids, positive_context_vectors) tuple of two equally long lists.

flatten_document_train_set = flatten_dataset_to_list(document_train_set)
flatten_document_valid_set = flatten_dataset_to_list(document_valid_set)

In the cell below, the length of the list of words extracted from the dataset is verified to be the same as the length of the list of positive context vectors associated to each word.

In [14]:
# Every word must come with exactly one positive context vector, in both splits
assert len(flatten_document_train_set[0]) == len(flatten_document_train_set[1])
assert len(flatten_document_valid_set[0]) == len(flatten_document_valid_set[1])

# PyTorch Dataset class creating

In [15]:
from torch.utils.data import Dataset


class set(Dataset):
    """Map-style PyTorch dataset of (word id, positive context) pairs.

    NOTE(review): this class name shadows Python's builtin ``set`` for every
    cell executed afterwards. It is kept unchanged because other cells
    instantiate the class under this name.
    """

    def __init__(self, flatten_document_set):
        """Store the two parallel sequences found in ``flatten_document_set``.

        Args:
            flatten_document_set: pair ``(word_ids, positive_contexts)`` of
                sequences with the same length.
        """
        words, contexts = flatten_document_set[0], flatten_document_set[1]
        self.word = words
        self.positive_context = contexts

    def __len__(self):
        """Return the number of words stored in the dataset."""
        return len(self.word)

    def __getitem__(self, idx: int):
        """Return the ``(word_id, positive_context)`` pair at position ``idx``."""
        return self.word[idx], self.positive_context[idx]

# ========== wrapping the flattened datasets in the PyTorch Dataset class defined above ==========

train_set = set(flatten_document_train_set)
valid_set = set(flatten_document_valid_set)

In [16]:
# Size of the training dataset (one item per word) and a look at its first item
print(f" The number of words in this dataset is {len(train_set)}")
print(f"First word id : {train_set[0][0]} | First positive context vector : {train_set[0][1]}")

 The number of words in this dataset is 824203
First word id : 1045 | First positive context vector : [0, 0, 0, 0, 2318, 3666, 1996, 2265]


Above, the positive context vector corresponding to the first word is indeed of size 2*4=8 (the radius R being 4). Note also that the first half of the vector consists of padding zeros: the first word of the dataset is also the first word of a text, so it has no left context. This shows that everything is computed correctly.

# `collate_fn` function

In [17]:
# Ids of every entry of the tokenizer vocabulary; negative contexts are sampled from this list
vocab_list = list(tokenizer.get_vocab().values())


def collate_fn(batch, R=R, K=K):
  """Assemble a batch and attach randomly drawn negative contexts.

  Args:
    batch: list of samples; ``sample[0]`` is a word id and ``sample[1]`` its
      positive context vector.
    R: context radius.
    K: number of negative ids drawn per positive context id.

  Returns:
    A dictionary of tensors with the keys "word_id", "positive_context_ids"
    and "negative_context_ids". Each sample receives 2*K*R negative ids
    sampled without replacement from ``vocab_list``.
  """
  n_negatives = 2 * K * R
  words = [sample[0] for sample in batch]
  positives = [sample[1] for sample in batch]
  # one independent draw per sample, in batch order
  negatives = [random.sample(vocab_list, n_negatives) for _ in batch]
  return {
      "word_id": torch.tensor(words),
      "positive_context_ids": torch.tensor(positives),
      "negative_context_ids": torch.tensor(negatives),
  }

# Wrapping in a DataLoader

In [18]:
from torch.utils.data import DataLoader


# Print the tensor shapes of the first batch for batch sizes 1 and 2
for batchsize in (1, 2):
  dataloader = DataLoader(train_set, batch_size=batchsize, collate_fn=collate_fn)
  for _batch in dataloader:
    report = [
        f"R={R}",
        f"K={K}",
        f"word_id_tensor = {_batch['word_id'].shape}",
        f"positive_context_ids_tensor = {_batch['positive_context_ids'].shape}",
        f"negative_context_ids_tensor = {_batch['negative_context_ids'].shape}",
    ]
    print(*report)
    break  # only the first batch is needed


R=4 K=6 word_id_tensor = torch.Size([1]) positive_context_ids_tensor = torch.Size([1, 8]) negative_context_ids_tensor = torch.Size([1, 48])
R=4 K=6 word_id_tensor = torch.Size([2]) positive_context_ids_tensor = torch.Size([2, 8]) negative_context_ids_tensor = torch.Size([2, 48])


As can be seen above, for a batch size of 1 and 2 (1 and 2 words in the batch), the positive context vectors associated are of size 2x4=8 (2xR), and the negative context vectors associated are of size 2x4x6=48 (2xRxK).

# Word2Vec model

In [19]:
class Word2Vec(nn.Module):
  """Word2Vec scorer with separate embedding tables for words and contexts."""

  def __init__(self, vocab_size, embedding_dimension):
    """Create the two embedding tables.

    Args:
      vocab_size: number of rows of each embedding table.
      embedding_dimension: size of every embedding vector.
    """
    super().__init__()
    self.embedding_words = nn.Embedding(vocab_size, embedding_dimension)
    self.embedding_context = nn.Embedding(vocab_size, embedding_dimension)

  def forward(self, target_word_ids, context_word_ids):
    """Score each context id against its target word.

    Args:
      target_word_ids: ids looked up in the word embedding table.
      context_word_ids: ids looked up in the context embedding table.

    Returns:
      The sigmoid of the dot product between each context embedding and the
      target embedding it is broadcast against.
    """
    target_vectors = self.embedding_words(target_word_ids)
    context_vectors = self.embedding_context(context_word_ids)

    # dim=2 is the embedding dimension: summing the element-wise product over it
    # yields one dot product per (target, context) pair
    dot_products = (context_vectors * target_vectors).sum(dim=2)
    return dot_products.sigmoid()

# `validation` function

In [20]:
def validation(model, valid_dataloader):
    """Compute the loss and accuracy of ``model`` on a validation dataloader.

    Args:
        model: module called as ``model(word_ids, context_ids)`` and returning
            scores in (0, 1).
        valid_dataloader: dataloader yielding dictionaries with the keys
            "word_id", "positive_context_ids" and "negative_context_ids".

    Returns:
        A pair ``(mean_loss, accuracy)``: the loss averaged over the batches,
        and the fraction of context predictions on the right side of 0.5.

    Raises:
        ZeroDivisionError: if ``valid_dataloader`` yields no batch.
    """
    # Tracking variables
    total_size = 0
    acc_total = 0
    loss_total = 0
    # reduction='none' keeps one loss value per (word, context) pair, so the means
    # over the contexts and over the batch are computed explicitly below
    criterion = nn.BCELoss(reduction='none')

    # Remember the current mode: it is restored on exit instead of forcing training mode
    was_training = model.training
    model.eval()

    # ========== Evaluation ==========

    try:
        with torch.no_grad():
            for batch in tqdm(valid_dataloader):

                # Pushing the batches to the computing device
                word_id = batch["word_id"].to(DEVICE)
                positive_context_ids = batch["positive_context_ids"].to(DEVICE)
                negative_context_ids = batch["negative_context_ids"].to(DEVICE)

                # unsqueeze(1) adds a context axis to the word ids so that the word
                # embedding broadcasts against every context embedding
                targets = word_id.unsqueeze(1)
                pred_pos = model(targets, positive_context_ids)  # positive context prediction
                pred_neg = model(targets, negative_context_ids)  # negative context prediction

                # Per-word loss: mean BCE over its positive contexts (label 1)
                # plus mean BCE over its negative contexts (label 0)
                loss_positive = torch.mean(criterion(pred_pos, torch.ones_like(pred_pos)), dim=1)
                loss_negative = torch.mean(criterion(pred_neg, torch.zeros_like(pred_neg)), dim=1)
                loss = torch.mean(loss_positive + loss_negative)
                loss_total += loss.detach().cpu().item()

                # Accuracy with a 0.5 decision threshold: positive contexts should
                # score above it, negative contexts below it
                correct_positive = pred_pos > 0.5
                correct_negative = pred_neg < 0.5
                acc_total += correct_positive.int().sum().item()
                acc_total += correct_negative.int().sum().item()
                total_size += correct_positive.numel()
                total_size += correct_negative.numel()
    finally:
        # Put the model back in the mode it was in when the function was called
        model.train(was_training)

    return loss_total / len(valid_dataloader), acc_total / total_size

# `training` function

In [21]:
def training(model, batch_size, n_epochs, lr=5e-5):
    """Train ``model`` on ``train_set`` and evaluate it on ``valid_set`` after each epoch.

    A new AdamW optimizer is created on every call.

    Args:
        model: module called as ``model(word_ids, context_ids)`` and returning
            scores in (0, 1).
        batch_size: number of samples per batch for both dataloaders.
        n_epochs: number of passes over the training set.
        lr: learning rate of the AdamW optimizer.

    Returns:
        Four lists with one value per epoch:
        ``(train_losses, train_accuracies, valid_losses, valid_accuracies)``.
    """
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=lr,
        eps=1e-08,
    )

    # Shuffle the training samples: the flattened dataset lists the words in reading
    # order, so unshuffled batches would only hold consecutive words of one review
    train_dataloader = DataLoader(
        train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
        )
    valid_dataloader = DataLoader(
        valid_set, batch_size=batch_size, collate_fn=collate_fn
        )

    list_val_acc = []
    list_train_acc = []
    list_train_loss = []
    list_val_loss = []
    # reduction='none' keeps one loss value per (word, context) pair, so the means
    # over the contexts and over the batch are computed explicitly below
    criterion = nn.BCELoss(reduction='none')

    for e in range(n_epochs):

        # ========== Training ==========

        # Set model to training mode
        model.train()

        # Tracking variables
        train_loss = 0
        epoch_train_acc = 0
        total_size = 0

        for batch in tqdm(train_dataloader):
            # Pushing the batches to the computing device
            word_id, positive_context_ids, negative_context_ids = (
                batch["word_id"].to(DEVICE),
                batch["positive_context_ids"].to(DEVICE),
                batch["negative_context_ids"].to(DEVICE),
            )

            optimizer.zero_grad()

            # Forward pass: unsqueeze(1) adds a context axis to the word ids so that
            # the word embedding broadcasts against every context embedding
            targets = word_id.unsqueeze(1)
            output_positive = model(targets, positive_context_ids)
            output_negative = model(targets, negative_context_ids)

            # Per-word loss: mean BCE over its positive contexts (label 1)
            # plus mean BCE over its negative contexts (label 0)
            loss_positive = torch.mean(criterion(output_positive, torch.ones_like(output_positive)), dim=1)
            loss_negative = torch.mean(criterion(output_negative, torch.zeros_like(output_negative)), dim=1)
            loss = torch.mean(loss_positive + loss_negative)

            # Backward pass
            loss.backward()
            optimizer.step()
            train_loss += loss.detach().cpu().item()

            # Accuracy with a 0.5 decision threshold, as in the validation function
            correct_positive = output_positive > 0.5
            correct_negative = output_negative < 0.5
            epoch_train_acc += correct_positive.int().sum().item()
            epoch_train_acc += correct_negative.int().sum().item()
            total_size += correct_positive.numel()
            total_size += correct_negative.numel()

        list_train_acc.append(epoch_train_acc / total_size)
        list_train_loss.append(train_loss / len(train_dataloader))

        # ========== Validation ==========

        l, a = validation(model, valid_dataloader)
        list_val_loss.append(l)
        list_val_acc.append(a)
        print(
            e,
            "\n\t - Train loss: {:.4f}".format(list_train_loss[-1]),
            "Train acc: {:.4f}".format(list_train_acc[-1]),
            "Val loss: {:.4f}".format(l),
            "Val acc:{:.4f}".format(a),
        )

    return list_train_loss, list_train_acc, list_val_loss, list_val_acc

# Setting some global variables (2 of 2)

In [22]:
embedding_dimension = 150  # size of each word / context embedding vector
vocab_size = len(tokenizer.get_vocab())  # number of entries in the tokenizer vocabulary

# `save_model` function

In [23]:
def save_model(model, file_path, dimension=embedding_dimension, radius=R, ratio=K, batch=batch_size, epoch=n_epochs):
    """Save the ``state_dict`` of ``model`` to a checkpoint named after its settings.

    Args:
        model: module whose ``state_dict()`` is saved.
        file_path: directory receiving the checkpoint; an empty string means
            the current working directory.
        dimension, radius, ratio, batch, epoch: values written into the file
            name, which has the form
            ``model_dim-<dimension>_radius-<radius>_ratio-<ratio>-batch-<batch>-epoch-<epoch>.ckpt``.
    """
    file_name = f"model_dim-{dimension}_radius-{radius}_ratio-{ratio}-batch-{batch}-epoch-{epoch}.ckpt"
    # os.path.join inserts the separator when file_path lacks a trailing one;
    # plain string concatenation would glue the directory name onto the file name
    torch.save(model.state_dict(), os.path.join(file_path, file_name))

PATH = ''  # location passed to save_model as file_path; '' saves in the current working directory

# Model training and checkpoint saving


Hereafter is the code that has been run for the training of the Word2Vec model. It saves the model checkpoint in a `.ckpt` file in the currently running environment, under the `content` directory. It is then reloaded in order to continue training. The further checkpoint is equivalently stored.

In [None]:
# Training from 0 to 10 epochs
print("STARTING THE TRAINING FROM 0 TO 10 EPOCHS")
model = Word2Vec(vocab_size, embedding_dimension)
model = model.to(DEVICE)
# Keep the per-epoch curves (train loss, train acc, val loss, val acc) instead of discarding them
history_0_10 = training(model, batch_size, n_epochs)
save_model(model, file_path=PATH, dimension=embedding_dimension, radius=R, ratio=K, batch=batch_size, epoch=n_epochs)

In [None]:
# Continuing training from 10 to 20 epochs
print("STARTING THE TRAINING FROM 10 TO 20 EPOCHS")
# Rebuild the checkpoint location from the current settings, following the naming scheme
# of save_model, so that it stays valid when PATH or a hyper-parameter changes
saved_model_path = os.path.join(
    PATH,
    f"model_dim-{embedding_dimension}_radius-{R}_ratio-{K}-batch-{batch_size}-epoch-{n_epochs}.ckpt",
)
pretrained_model = Word2Vec(vocab_size, embedding_dimension)
pretrained_model.load_state_dict(torch.load(saved_model_path, map_location=DEVICE))
pretrained_model = pretrained_model.to(DEVICE)
# NOTE: training() creates a fresh AdamW optimizer, so only the weights are resumed,
# not the optimizer state
history_10_20 = training(pretrained_model, batch_size, n_epochs)
save_model(pretrained_model, file_path=PATH, dimension=embedding_dimension, radius=R, ratio=K, batch=batch_size, epoch=2*n_epochs)

The "# Continuing training from 10 to 20 epochs" cell was repeated up to a hundred epochs in order to continue training the model. These cells have been removed because they were executed on the LAMSADE servers rather than in this notebook on Google Colab. However, the resulting training loss, training accuracy, validation loss and validation accuracy have been logged into a plot that you can see in the associated PDF report, along with its analysis.