This notebook is used to learn the embeddings of words in a dataset using the word2vec model.

In [5]:
from datasets import load_dataset
import torch
import torchtext
import nltk
from nltk.corpus import stopwords
from datasets import load_from_disk
import numpy as np

import pandas as pd
from datasets import Dataset

In [6]:
# single seed shared by every random number generator seeded in this cell
seed = 257

# seed NumPy's global generator and PyTorch (including CUDA)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# ask cuDNN to pick deterministic algorithms
torch.backends.cudnn.deterministic = True

## Prepare the data

We begin by tokenizing and cleaning the data. This process consists of removing punctuation, numbers, and stop words.

In [None]:
# load the dataset
# both splits are requested in one call and unpacked in the order given in `split`
train_data, test_data = load_dataset("yelp_polarity", split=["train", "test"])

In [None]:
# tokenize the dataset
# torchtext's "basic_english" tokenizer; tokenize_and_clean below reads it as a global
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")


def tokenize(obs, tokenizer, max_length=512):
    """
    Split the text of one observation into tokens

    obs: mapping holding the raw text under the "text" key
    tokenizer: callable that turns a string into a sequence of tokens
    max_length: the maximum number of tokens kept (extra tokens are dropped)

    Returns a dict with a single "tokens" key.
    """
    all_tokens = tokenizer(obs["text"])
    kept_tokens = all_tokens[:max_length]
    return dict(tokens=kept_tokens)


In [None]:
# remove stopwords and punctuation
# kept as a set: the helpers below test `word not in stop_words` once per token,
# and set membership is a hash lookup instead of a scan over the whole list
stop_words = set(stopwords.words("english"))


def remove_stopwords(obs):
    """
    Drop every token of an observation that is listed in stop_words

    obs: mapping with a "tokens" entry; the entry is replaced in place
    Returns the same obs object, so the function can be used with Dataset.map
    """
    kept = []
    for token in obs["tokens"]:
        if token in stop_words:
            continue
        kept.append(token)
    obs["tokens"] = kept
    return obs


def remove_punctuation(obs):
    """
    Keep only the purely alphabetic tokens of an observation

    Any token for which str.isalpha() is False is dropped, which covers
    punctuation as well as numbers and mixed tokens such as "a1".

    obs: mapping with a "tokens" entry; the entry is replaced in place
    Returns the same obs object, so the function can be used with Dataset.map
    """
    alphabetic = []
    for token in obs["tokens"]:
        if token.isalpha():
            alphabetic.append(token)
    obs["tokens"] = alphabetic
    return obs


def tokenize_and_clean(obs, max_chars=512):
    """
    Tokenize an observation and drop stopwords and non-alphabetic tokens

    obs: mapping holding the raw text under the "text" key
    max_chars: number of leading characters of the text that are tokenized
        (defaults to 512, the value that used to be hard-coded)

    Note that the cap is applied to the raw string *before* tokenizing, so it
    limits characters rather than tokens (unlike tokenize, which caps the
    number of tokens) and the last word may be clipped.

    Returns a dict with a single "tokens" key, suitable for Dataset.map.
    Relies on the notebook-level `tokenizer` and `stop_words`.
    """
    # one pass applies both filters instead of materialising an intermediate list
    tokens = [
        word
        for word in tokenizer(obs["text"][:max_chars])
        if word not in stop_words and word.isalpha()
    ]
    return {"tokens": tokens}


# train_data = train_data.map(remove_stopwords)

In [None]:
# tokenizer(train_data[0]["text"][:512])
# add a "tokens" column to every training example; train_data is rebound to the mapped dataset
train_data = train_data.map(tokenize_and_clean)
# test_data = test_data.map(tokenize_and_clean)


In [None]:
# train_data.save_to_disk("/datasets/yelp_polarity_train")
# train_data = load_from_disk("/datasets/yelp_polarity_train/")

Now that our data has been tokenized and cleaned, we can create a validation set.

In [13]:
# validation data
# hold out 25% of the training examples for validation; the notebook-wide seed is
# passed explicitly so the split does not depend on whatever random state is
# active when this cell happens to be executed
train_valid_data = train_data.train_test_split(test_size=0.25, seed=seed)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

From the training data, we now create a vocabulary composed of the training data's unique words (only those that appear at least 75 times are kept).

In [11]:
# creating the vocabulary
# "<unk>" is the only special token; it is also used below as the default index
special_tokens = ["<unk>"]

# setting a minimum frequency for the tokens ... 75 times in 420,000 sentences is not a lot
vocab = torchtext.vocab.build_vocab_from_iterator(
    train_data["tokens"], specials=special_tokens, min_freq=75
)
# tokens that are not in the vocabulary are looked up as "<unk>"
vocab.set_default_index(vocab["<unk>"])
# last expression of the cell: display the vocabulary size
len(vocab)

11739

Now that we have the vocabulary, we can numerically encode the words in the training data.

In [26]:
def numericalize_example(obs, vocab):
    """
    Translate the tokens of one observation into vocabulary indices

    obs: mapping with a "tokens" entry
    vocab: object exposing lookup_indices(tokens) -> list of indices

    Returns a dict with a single "ids" key, suitable for Dataset.map.
    """
    return dict(ids=vocab.lookup_indices(obs["tokens"]))


# add an "ids" column with the vocabulary index of every token; vocab is forwarded via fn_kwargs
train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

Map:   0%|          | 0/420000 [00:00<?, ? examples/s]

Now that we have numericalized the data, we can create word pairs for the skip-gram model. Since we're already iterating over the entire dataset, we'll also convert the data from indices to tensors in the same pass.

In [17]:
# find a way to incorporate transformation to tensors in this process

def get_word_pairs(sentence, window_size=3):
    """
    Lazily produce (center, context) pairs for one sentence

    For every position, each offset from 1 to window_size is visited in
    increasing order; the neighbour that many places to the right is emitted
    first (when it exists), followed by the neighbour that many places to the
    left (when it exists).

    sentence: sequence of values accepted by torch.tensor
        (presumably vocabulary indices - confirm against the caller)
    window_size: largest distance between a center and its context

    Yields 2-tuples of tensors built with torch.tensor.
    """
    for center, _ in enumerate(sentence):
        last = len(sentence) - 1
        for offset in range(1, window_size + 1):
            ahead = center + offset
            behind = center - offset
            if ahead <= last:
                yield (torch.tensor(sentence[center]), torch.tensor(sentence[ahead]))
            if behind >= 0:
                yield (torch.tensor(sentence[center]), torch.tensor(sentence[behind]))


def extract_pairs(dataset, column="ids"):
    """
    Extract word pairs from dataset

    dataset: iterable of observations (mappings)
    column: key of the sequence handed to get_word_pairs for each observation.
        Defaults to "ids", the integer-encoded tokens written by
        numericalize_example: get_word_pairs calls torch.tensor on every
        element, which rejects the raw strings stored under "tokens".

    Returns a list with the pairs of every observation, in dataset order.
    """
    pairs = []
    for obs in dataset:
        pairs.extend(get_word_pairs(obs[column]))
    return pairs

In [18]:
# convert the new training data to a dataset from a DataFrame
# extract_pairs returns a plain list of pairs; the name new_train is then reused for the Dataset
new_train = extract_pairs(train_data)

# one row per pair: column "x" holds the first element, column "y" the second
new_train = Dataset.from_pandas(pd.DataFrame(new_train, columns=["x", "y"]))

# convert the new validation data to a dataset from a DataFrame
# new_valid = extract_pairs(valid_data)
# new_valid = Dataset.from_pandas(pd.DataFrame(new_valid, columns=["x", "y"]))

# # convert the new test data to a dataset from a DataFrame
# new_test = extract_pairs(test_data)
# new_test = Dataset.from_pandas(pd.DataFrame(new_test, columns=["x", "y"]))

In [None]:
# persist the word pairs so the extraction above does not have to be repeated
new_train.save_to_disk("/datasets/yelp_polarity_train_pairs")
# new_train = load_from_disk("/datasets/yelp_polarity_train_pairs")

# new_valid.save_to_disk("/datasets/yelp_polarity_valid_pairs")
# new_valid = load_from_disk("/datasets/yelp_polarity_valid_pairs")

# new_test.save_to_disk("/datasets/yelp_polarity_test_pairs")
# new_test = load_from_disk("/datasets/yelp_polarity_test_pairs")


Now we reload the saved word pairs from disk so that our datasets can be converted to PyTorch tensors.

In [7]:
# new_train.save_to_disk("/datasets/yelp_polarity_train_pairs")
# reload the word pairs from the directory used by the save_to_disk call
new_train = load_from_disk("/datasets/yelp_polarity_train_pairs")


The embedding layer is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector (dense as the dimensionality is a lot smaller and all the elements are real numbers). This embedding layer is simply a single fully connected layer. As well as reducing the dimensionality of the input to the RNN, there is the theory that words which have similar impact on the sentiment of the review are mapped close together in this dense vector space. For more information about word embeddings, see here.