In [2]:
import torch
import random
import zipfile

with zipfile.ZipFile("../data/jaychou_lyrics.txt.zip")as zin:
    with zin.open("jaychou_lyrics.txt") as f:
        corpus_chars = f.read().decode("utf-8")

corpus_chars[:40]

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

In [6]:
## 周杰伦歌词数据集
# 这个数据集有６万多个字符，为了打印方便，将换行符换成空格
corpus_chars = corpus_chars.replace("\n"," ").replace("\r", " ")
# corpus_chars = corpus_chars[0:10000]

## 建立字符索引
将每个字符映射为一个从０开始的整数，称为索引，方便后续的数据处理。　为了得到索引，将数据集中的所有不同字符取出来，然后将其逐一映射构建字典。

In [7]:
idx_to_char = list(set(corpus_chars))
char_to_idx = dict([(char,i) for i,char in enumerate(idx_to_char)])
vocab_size = len(char_to_idx)
vocab_size

1027

In [8]:
# 将数据集中的字符转换为索引
corpus_indices = [char_to_idx[char] for char in corpus_chars]
sample = corpus_indices[:20]
print("chars:"," ".join([idx_to_char[idx] for idx in sample]))
print("indices:",sample)

chars: 想 要 有 直 升 机   想 要 和 你 飞 到 宇 宙 去   想 要 和
indices: [356, 466, 720, 696, 248, 184, 219, 356, 466, 227, 505, 199, 973, 88, 1002, 509, 219, 356, 466, 227]


In [9]:
def data_iter_random(corpus_indices,batch_size,num_steps,device=None):
    # -1是因为输出的索引ｘ市相应的输入的索引ｙ+1
    num_examples = (len(corpus_indices) - 1) // num_steps
    epoch_size = num_examples // batch_size

    examples_indices = list(range(num_examples))
    random.shuffle(examples_indices)

    def _data(pos):
        return corpus_indices[pos:pos + num_steps]
    
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    for i in range(epoch_size):
        i = i * batch_size
        batch_indices = examples_indices[i: i + batch_size]
        x = [_data(j * num_steps) for j in batch_indices]
        y = [_data(j * num_steps + 1) for j in batch_indices]
        yield torch.tensor(x,dtype=torch.float32,device=device),torch.tensor(y,dtype=torch.float32,device=device)

In [11]:
my_seq = list(range(30))
# batch_size = 2,每次取２个样本
# steps = 6，每个样本是由６个样本组成的片段
for x,y in data_iter_random(my_seq,batch_size=2,num_steps=6): 
    print("x:",x)
    print("y:",y)

x: tensor([[18., 19., 20., 21., 22., 23.],
        [ 0.,  1.,  2.,  3.,  4.,  5.]], device='cuda:0')
y: tensor([[19., 20., 21., 22., 23., 24.],
        [ 1.,  2.,  3.,  4.,  5.,  6.]], device='cuda:0')
x: tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [12., 13., 14., 15., 16., 17.]], device='cuda:0')
y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [13., 14., 15., 16., 17., 18.]], device='cuda:0')


In [12]:
def data_iter_consecutive(corpus_indices,batch_size,num_stpes,device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    corpus_indices = torch.tensor(corpus_indices,dtype=torch.float32,device=device)
    data_len = len(corpus_indices)
    batch_len = data_len // batch_size
    
    indices = corpus_indices[0:batch_size * batch_len].view(batch_size,batch_len)
    
    epoch_size = (batch_len - 1) // num_stpes

    for i in range(epoch_size):
        i = i * num_stpes
        x = indices[:,i : i + num_stpes]
        y = indices[:,i + 1 : i + num_stpes + 1]
        yield x,y

In [13]:
for x,y in data_iter_consecutive(my_seq,batch_size=2,num_stpes=6):
    print("x:",x)
    print("y:",y)

x: tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]], device='cuda:0')
y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]], device='cuda:0')
x: tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]], device='cuda:0')
y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]], device='cuda:0')


In [1]:
import d2lzh as d2l

corpus_indices,char_to_idx,idx_to_char,vocab_size = d2l.load_data_jay_lyrics()

print(corpus_indices[:40])
print(vocab_size)

[1687, 2052, 1784, 372, 1111, 843, 48, 1687, 2052, 1359, 29, 423, 533, 2283, 2090, 1349, 48, 1687, 2052, 1359, 29, 102, 1421, 2437, 1275, 1826, 48, 102, 1421, 2437, 2283, 2090, 411, 48, 78, 1346, 1128, 1346, 1128, 1346]
2582


In [3]:
print(idx_to_char[corpus_indices[0]])

想
