# 语言模型数据集(周杰伦专辑歌词)

In [1]:
import torch
import random
import zipfile
import numpy as np

## 读取数据集

In [2]:
with zipfile.ZipFile('../Datasets/jaychou_lyrics/jaychou_lyrics.txt.zip') as zin:
    with zin.open('jaychou_lyrics.txt') as f:
        corpus_chars = f.read().decode('utf-8')  # corpus

In [3]:
print(len(corpus_chars))
corpus_chars[:40]

63282


'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

In [4]:
# corpus process
# replace '\n' or '\t' with ' '
corpus_chars = corpus_chars.replace('\n', ' ').replace('\t', ' ')
corpus_chars = corpus_chars[: 20000]

In [5]:
print(len(corpus_chars))
corpus_chars[:40]

20000


'想要有直升机 想要和你飞到宇宙去 想要和你融化在一起 融化在宇宙里 我每天每天每'

In [6]:
# vocab count
def vocab_count(text, max_vocab=None):
    
    vocab = set(corpus_chars)
    vocab_counts = {}

    # initial vocab_count
    for word in corpus_chars:
        if vocab_counts.get(word) is None:
            vocab_counts[word] = 1
        else:
            vocab_counts[word] += 1

    vocab_count_list = []
    for word, count in vocab_counts.items():

        vocab_count_list.append((word, count))

    # sort according to word count from large to small
    vocab_count_list.sort(key=lambda x: x[1], reverse=True) 
     
    if max_vocab is not None and len(vocab_count_list) > max_vocab:
        vocab_count_list = vocab_count_list[:max_vocab]
        
    vocab = [x[0] for x in vocab_count_list]
    
    return vocab

In [7]:
vocab = vocab_count(corpus_chars)
char_to_index = {c:i for i, c in enumerate(vocab)}
index_to_char = dict(enumerate(vocab))
vocab_size = len(vocab)

In [8]:
def word_to_index(word):
    if word in char_to_index:
        return char_to_index[word]
    else:
        return len(vocab)

In [9]:
def index_to_word(index):
    if index == len(vocab):
        return '<unk>'
    elif index < len(vocab):
        return index_to_char[index]
    else:
        raise Exception('Unknown index!')


In [10]:
# convert text to array
corpus_indices = [word_to_index(word) for word in corpus_chars]

print(corpus_indices[:40])
print(''.join([index_to_word(index) for index in corpus_indices[:40]]))

[8, 17, 7, 79, 950, 591, 0, 8, 17, 111, 3, 208, 14, 679, 680, 32, 0, 8, 17, 111, 3, 681, 165, 6, 5, 67, 0, 681, 165, 6, 679, 680, 58, 0, 2, 135, 15, 135, 15, 135]
想要有直升机 想要和你飞到宇宙去 想要和你融化在一起 融化在宇宙里 我每天每天每


## 时序数据采样

### 随机采样

在随机采样中，每个样本是原始序列上任意截取的一段序列。相邻的两个随机小批量在原始序列上的位置不一定相毗邻。因此，我们无法用一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态。

**在训练模型时，每次随机采样前都需要重新初始化隐藏状态。**

In [11]:
def data_loader_random(corpus_indices, seq_length, time_steps, device=None):
    num_samples = (len(corpus_indices) - 1) // time_steps
    epoch_size = num_samples // seq_length
    
    # shuffle sample
    samples_indices = list(range(num_samples))
    random.shuffle(samples_indices)
    
    # generator data
    for i in range(epoch_size):
        
        i = i * seq_length
        
        batch_incices = samples_indices[i: i+seq_length]
        x = [corpus_indices[indices: indices+time_steps] for indices in batch_incices]
        y = [corpus_indices[indices+1: indices+time_steps+1] for indices in batch_incices]
        
        x = torch.tensor(x, dtype=torch.float32).view(seq_length, time_steps)
        y = torch.tensor(y, dtype=torch.float32).view(seq_length, time_steps)
        
        yield x, y      

In [12]:
demo_seq = list(range(30))
for x, y in data_loader_random(demo_seq, seq_length=2, time_steps=6):
   print(x , '\n', y)

tensor([[0., 1., 2., 3., 4., 5.],
        [1., 2., 3., 4., 5., 6.]]) 
 tensor([[1., 2., 3., 4., 5., 6.],
        [2., 3., 4., 5., 6., 7.]])
tensor([[2., 3., 4., 5., 6., 7.],
        [3., 4., 5., 6., 7., 8.]]) 
 tensor([[3., 4., 5., 6., 7., 8.],
        [4., 5., 6., 7., 8., 9.]])


### 相邻采样

**相邻连个随机小批量在原始序列上的位置相毗邻**

这时，我们可以用一个小批量的最终时间步的隐藏状态初始化下一个小批量的隐藏状态， 从而使得下一个小批量的输出也取决于当前小批量的输入，如此循环下去。

这对实现循环神经网络造成了两方面影响：
* 一方面， 在训练模型时，我们只需在每一个迭代周期开始时初始化隐藏状态；
* 另一方面，当多个相邻小批量通过传递隐藏状态串联起来时，模型参数的梯度计算将依赖所有串联起来的小批量序列。

** 为了使模型参数的梯度计算只依赖一次迭代读取的小批量序列，我们可以在每次读取小批量前将隐藏状态从计算图中分离出来**

In [13]:
def data_loader_consecutive(corpus_indices, seq_length, time_steps):

    data_len = len(corpus_indices)

    seq_size = data_len // seq_length

    # resize to => (seq_length, seq_size)
    corpus_indices = np.array(corpus_indices[: seq_size * seq_length], dtype=np.float).reshape((seq_length, -1))

    epoch_size = (seq_size - 1) // time_steps

    # generator data
    np.random.shuffle(corpus_indices)

    # convert to torch tensor
    torch_indices = torch.tensor(corpus_indices, dtype=torch.float32).view(seq_length, seq_size)
    for i in range(epoch_size):
        i = i * time_steps
        x = torch_indices[:, i: i + time_steps]
        y = torch_indices[:, i + 1: i + time_steps + 1]

        yield x, y

In [14]:
for x, y in data_loader_consecutive(demo_seq, seq_length=2, time_steps=6):
   print(x , '\n', y)

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]]) 
 tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]])
tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]]) 
 tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]])
