In [2]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [3]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

In [4]:
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')

In [5]:
counter = Counter()
for line in train_iter:
    counter.update(tokenizer(line))
vocab = Vocab(counter)

In [130]:
import os
from collections import Counter

import torch
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import WikiText2
from torchtext.vocab import Vocab


def generate_vocabulary(data_dir="data/"):
    raw_text_iter = WikiText2(root=os.path.join(data_dir, "wikitext"), split="train")
    tokenizer = get_tokenizer("basic_english")
    counter = Counter()
    for x in raw_text_iter:
        counter.update(tokenizer(x))
    return Vocab(counter)


class WikiTextDataset(Dataset):
    """Wikipedia Language Modelling."""

    def __init__(self, data_dir, split, vocab, seq_len=50):
        super().__init__()

        self.data_dir = data_dir
        self.split = split
        self.vocab = vocab
        self.seq_len = seq_len
        self.tokenizer = get_tokenizer("basic_english")

        data_iter = WikiText2(root=os.path.join(data_dir, "wikitext"), split=split)
        self.train_data = self.data_process(data_iter)

    def data_process(self, raw_text_iter):
        data = [
            torch.tensor([self.vocab[token] for token in self.tokenizer(item)], dtype=torch.long)
            for item in raw_text_iter
        ]
        data = [x for x in data if x.numel() > 0]
        return torch.cat((data))

    def __getitem__(self, index):
        data = self.train_data[index * self.seq_len : (index + 1) * self.seq_len]
        target = self.train_data[(index + 1) * self.seq_len : (index + 2) * self.seq_len]
        return data, target

    def __len__(self):
        return len(self.train_data) // self.seq_len - 1


In [131]:
from typing import Optional

from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader, Dataset


class WikiTextDataModule(LightningDataModule):
    """Wikipedia Language Modelling."""

    def __init__(
        self,
        data_dir: str = "data/",
        batch_size: int = 64,
        seq_len: int = 30,
        num_workers: int = 0,
        pin_memory: bool = False,
        drop_last: bool = False,
    ):
        super().__init__()

        self.data_dir = data_dir
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.drop_last = drop_last

        self.data_train: Optional[Dataset] = None
        self.data_val: Optional[Dataset] = None
        self.data_test: Optional[Dataset] = None

    def prepare_data(self):
        """Download data if needed."""
        # WikiTextDataset(self.data_dir, )
        # WikiTextDataset(self.data_dir, )

    def setup(self, stage: Optional[str] = None):
        """Load data. Set variables: self.data_train, self.data_val, self.data_test."""
        self.vocab = generate_vocabulary(data_dir=self.data_dir)
        self.data_train = WikiTextDataset(
            self.data_dir, split="train", vocab=self.vocab, seq_len=self.seq_len
        )
        self.data_val = WikiTextDataset(
            self.data_dir, split="valid", vocab=self.vocab, seq_len=self.seq_len
        )
        self.data_test = WikiTextDataset(
            self.data_dir, split="test", vocab=self.vocab, seq_len=self.seq_len
        )

    def train_dataloader(self):
        return DataLoader(
            dataset=self.data_train,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            drop_last=self.drop_last,
            shuffle=True,
        )

    def val_dataloader(self):
        return DataLoader(
            dataset=self.data_val,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            drop_last=self.drop_last,
            shuffle=False,
        )

    def test_dataloader(self):
        return DataLoader(
            dataset=self.data_test,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            drop_last=self.drop_last,
            shuffle=False,
        )


In [183]:
datamodule = WikiTextDataModule(seq_len=100, batch_size=1)
datamodule.setup()

In [192]:
for x, y in datamodule.train_dataloader():
    print(x.shape)
    sentence1 = [datamodule.vocab.itos[int(number)] for number in torch.flatten(x)]
    print(' '.join(sentence1))
    
    print()
    
    sentence2 = [datamodule.vocab.itos[int(number)] for number in torch.flatten(y)]
    print(' '.join(sentence2))
    break

torch.Size([1, 100])
state that the habit of double spacing is too deeply <unk> to change . others claim that additional space between sentences improves the aesthetics or readability of text . proponents of double sentence spacing also state that some publishers may still require double @-@ spaced manuscript submissions from authors . a key example noted is the screenwriting industry ' s monospaced standard for screenplay manuscripts , courier , 12 @-@ point font , although some works on screenwriting indicate that courier is merely preferred – proportional fonts may be used . some reliable sources state simply that writers should follow

their particular style guide , but proponents of double spacing caution that publishers ' guidance takes precedence , including those that ask for double sentence spaced manuscripts . one of the most popular arguments against wider sentence spacing is that it was created for monospaced fonts of the typewriter , and is no longer needed with modern pro

In [151]:
vocab = datamodule.vocab.freqs
print(datamodule.vocab.freqs["the"])

sentence = 

print(list(vocab.keys())[list(vocab.values()).index(17)])

130768
divorced


In [162]:
data_dir = "data/"
raw_text_iter = ["abc", "def", "gh", "gh", "def"]
tokenizer = get_tokenizer("basic_english")
counter = Counter()
for x in raw_text_iter:
    counter.update(tokenizer(x))
print(counter)
vocab = Vocab(counter)
print(vocab["def"])
print(vocab["gh"])
print(vocab["abc"])

Counter({'def': 2, 'gh': 2, 'abc': 1})
2
3
4


torch.Size([64, 30])
tensor([   88,     3,    55,  2541,     3,   129,   149,    12,  1497,  7087,
            3,  1487,     3, 10371,     6, 23132,     3, 17746,     4,    67,
         1609,     2,    79,     3,  1280,    11,  3016,     9,   289,  2204])
['city', ',', 'new', 'jersey', ',', 'united', 'states', "'", 'entertainment', 'resort', ',', 'hotel', ',', 'casino', 'and', 'spa', ',', 'revel', '.', 'while', 'singing', 'the', 'song', ',', 'beyoncé', 'was', 'wearing', 'a', 'black', 'dress']


In [120]:
def generate_square_mask(size):
    mask = torch.triu(torch.ones(size, size))
    mask = torch.flip(mask, dims=(-1,))
    return mask

In [121]:
generate_square_mask(10)

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

In [2]:
from datasets import list_datasets, load_dataset, list_metrics, load_metric
datasets_list = list_datasets()
len(datasets_list)

886

In [7]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

Reusing dataset wikitext (/home/ash/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20)


In [11]:
dataset["test"]

Dataset({
    features: ['text'],
    num_rows: 4358
})

In [25]:
haha = iter(dataset["test"])
print(next(haha))
print(next(haha))
print(next(haha))
print(next(haha))
print(next(haha))
print(next(haha))

{'text': ''}
{'text': ' = Robert Boulter = \n'}
{'text': ''}
{'text': ' Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n'}
{'text': ' In 2006 , Boulter starred alongside Whishaw in the pla

In [30]:
print(dataset.shape)
print(dataset.num_columns)
print(dataset.num_rows)
print(dataset.column_names)

{'test': (4358, 1), 'train': (36718, 1), 'validation': (3760, 1)}
{'test': 1, 'train': 1, 'validation': 1}
{'test': 4358, 'train': 36718, 'validation': 3760}
{'test': ['text'], 'train': ['text'], 'validation': ['text']}


In [32]:
dataset.description

AttributeError: 'DatasetDict' object has no attribute 'description'

In [59]:
# dataset["test"]["text"][:300]

abc = [3, 2 ,5 ,2 ,5, 2, 5]
abc[:50]

[3, 2, 5, 2, 5, 2, 5]