In [1]:
import torch
import numpy as np
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
import re
from collections import Counter
import rnn_utils2

In [7]:
# Load the corpus and its vocabulary through the local `rnn_utils2` helper module.
# NOTE(review): presumably `corpus` is a flat sequence of token indices and `vocab`
# the matching vocabulary object -- confirm against `rnn_utils2`.
corpus, vocab = rnn_utils2.load_corpus_time_machine()

In [27]:

# The loader lives in the imported `rnn_utils2` module; the bare name
# `load_corpus_time_machine` is never defined or imported in this notebook, so
# calling it unqualified raises NameError on a fresh kernel.
corpus, vocab = rnn_utils2.load_corpus_time_machine()
# Display the number of tokens in the corpus and the vocabulary size.
len(corpus), len(vocab)

(170108, 28)

In [17]:
def seq_data_iter_random(corpus, batch_size, num_steps, verbose=True):
    """Generate minibatches of subsequences using random sampling.

    Args:
        corpus: Sliceable sequence of token indices whose slices can be passed
            to `torch.tensor` (e.g. a list of ints).
        batch_size: Number of subsequences in each minibatch.
        num_steps: Length of each subsequence.
        verbose: When True (the default, matching the original behavior), print
            the diagnostic trace (corpus length, counts, shuffled indices).
            Pass False to silence it, e.g. inside a training loop.

    Yields:
        Tuples `(X, Y)` of tensors with shape `(batch_size, num_steps)`, where
        `Y` holds the same windows as `X` shifted one token to the right.
    """
    def log(message):
        # Single switch for all diagnostic output.
        if verbose:
            print(message)

    # Start with a random offset in [0, num_steps - 1] to partition a sequence.
    # `np.random.randint` excludes its upper bound, so the bound must be
    # `num_steps`; passing `num_steps - 1` never reaches the last offset and
    # raises ValueError when `num_steps == 1`.
    offset = np.random.randint(0, num_steps)
    corpus = corpus[offset:]
    log(f"length of corpus = {len(corpus)}")
    # Subtract 1 since we need to account for labels
    num_subseqs = (len(corpus) - 1) // num_steps
    log(f"num_subseqs = {num_subseqs}")
    # The starting indices for subsequences of length `num_steps`
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # In random sampling, the subsequences from two adjacent random
    # minibatches during iteration are not necessarily adjacent on the
    # original sequence
    np.random.shuffle(initial_indices)
    log(initial_indices)

    def data(pos):
        # Return a sequence of length `num_steps` starting from `pos`
        return corpus[pos:pos + num_steps]

    # Subsequences that do not fill a whole minibatch are dropped.
    num_batches = num_subseqs // batch_size
    log(f"num_batches = {num_batches}")
    for i in range(0, batch_size * num_batches, batch_size):
        # Here, `initial_indices` contains randomized starting indices for subsequences
        log(f"i = {i}, batch_size = {batch_size}")
        initial_indices_per_batch = initial_indices[i:i + batch_size]
        log(initial_indices_per_batch)
        X = [data(j) for j in initial_indices_per_batch]
        # Labels are the inputs shifted by one token.
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

In [22]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):  #@save
    """Yield minibatches of subsequences using sequential partitioning.

    The corpus is cut at a random offset, trimmed to a multiple of
    `batch_size`, and laid out as `batch_size` rows; consecutive minibatches
    are consecutive `num_steps`-wide column windows of those rows.

    Yields:
        Tuples `(X, Y)` of tensors with `batch_size` rows and `num_steps`
        columns, where `Y` is `X` shifted one token to the right.
    """
    # Random starting point in [0, num_steps - 1].
    offset = np.random.randint(0, num_steps)
    print(f"offset = {offset}")

    # Leave one token for the final label, then round down to a multiple of
    # `batch_size` so the tokens fill the rows evenly.
    usable = len(corpus) - offset - 1
    num_tokens = usable - usable % batch_size
    print(f"num_tokens = {num_tokens}")

    # Inputs start at `offset`; labels start one token later.
    label_start = offset + 1
    Xs = torch.tensor(corpus[offset:offset + num_tokens])
    Ys = torch.tensor(corpus[label_start:label_start + num_tokens])
    Xs = Xs.reshape(batch_size, -1)
    Ys = Ys.reshape(batch_size, -1)
    print(f"Xs.shape = {Xs.shape}")

    # Only full-width windows are emitted; a ragged tail is dropped.
    num_batches = Xs.shape[1] // num_steps
    print(f"num_batches = {num_batches}")
    for batch_idx in range(num_batches):
        lo = batch_idx * num_steps
        hi = lo + num_steps
        yield Xs[:, lo:hi], Ys[:, lo:hi]

In [23]:
# Exercise the sequential iterator on a toy sequence 0..34 so the printed
# minibatches are easy to check by eye: each Y should equal X shifted by one.
my_seq = list(range(35))
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    # Show every input minibatch next to its label minibatch.
    print('X: ', X, '\nY:', Y)

offset = 0
num_tokens = 34
Xs.shape = torch.Size([2, 17])
num_batches = 3
X:  tensor([[ 0,  1,  2,  3,  4],
        [17, 18, 19, 20, 21]]) 
Y: tensor([[ 1,  2,  3,  4,  5],
        [18, 19, 20, 21, 22]])
X:  tensor([[ 5,  6,  7,  8,  9],
        [22, 23, 24, 25, 26]]) 
Y: tensor([[ 6,  7,  8,  9, 10],
        [23, 24, 25, 26, 27]])
X:  tensor([[10, 11, 12, 13, 14],
        [27, 28, 29, 30, 31]]) 
Y: tensor([[11, 12, 13, 14, 15],
        [28, 29, 30, 31, 32]])
