In [189]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers import Activation

import collections
import os
import sys
import jieba
import pickle
import numpy as np

In [190]:
# --- Data and checkpoint locations ---
file_path = './dataset/data/周杰伦歌词大全_cleaned.txt'
checkpoints_dir = './checkpoints/lyrics'
model_prefix = 'lyrics'

# --- Training hyper-parameters ---
batch_size = 20
learning_rate = 0.01
epochs = 200

# --- Sentinel markers wrapped around every lyric line ---
start_token = 'G'
end_token = 'E'


In [191]:
def process_lyrics(file_path):
    """Load a lyrics corpus and encode every song as a list of word ids.

    Each line of ``file_path`` is treated as one song. Lines shorter than 80
    characters (measured before stripping, so the line terminator counts) are
    skipped. The remaining lines are stripped, wrapped in the module-level
    ``start_token`` / ``end_token`` markers, sorted by length (shortest first)
    and segmented with ``jieba.lcut(..., cut_all=False)``.

    Word ids are assigned by descending frequency, so the most frequent word
    gets id 0.

    Args:
        file_path: Path of the text file holding one song per line.

    Returns:
        A tuple ``(lyrics_vector, word_int_map, words)`` where
        ``lyrics_vector`` is a list of id lists (one per song),
        ``word_int_map`` maps each word to its id, and ``words`` is the tuple
        of all words ordered by id.

    Raises:
        ValueError: If no line of the file is long enough to be kept.
    """
    # The context manager guarantees the file handle is closed, even on error.
    with open(file_path) as fr:
        lyrics = [start_token + line.strip() + end_token
                  for line in fr
                  if len(line) >= 80]
    lyrics.sort(key=len)
    print('all %d songs...' % len(lyrics))
    if not lyrics:
        raise ValueError('no usable lyrics found in %r' % (file_path,))

    # Segment each song exactly once and reuse the tokens for both the
    # frequency count and the id encoding below.
    tokenized = [jieba.lcut(lyric, cut_all=False) for lyric in lyrics]

    # Count how many times every word occurs across the whole corpus.
    counter = collections.Counter()
    for tokens in tokenized:
        counter.update(tokens)
    # Sanity check: how often the end marker survived segmentation as a token.
    print(counter[end_token])

    # Most frequent words first; the position in this order becomes the id.
    counter_pairs = sorted(counter.items(), key=lambda x: -x[1])
    words, _ = zip(*counter_pairs)
    print(end_token in words)

    word_int_map = dict(zip(words, range(len(words))))
    # Translate every song into its vector of word ids.
    lyrics_vector = [[word_int_map[w] for w in tokens] for tokens in tokenized]
    return lyrics_vector, word_int_map, words

In [192]:
def generate_batch(batch_size, lyrics_vec, word_to_int):
    """Group encoded lyrics into padded input/target batches.

    The lyrics are consumed in consecutive groups of ``batch_size``; a final
    incomplete group is dropped. Within a group every lyric is right-padded
    with the id of ``' '`` up to the length of the group's longest lyric.

    Args:
        batch_size: Number of lyrics per batch.
        lyrics_vec: List of lyrics, each a list of word ids.
        word_to_int: Mapping from word to id; must contain ``' '`` whenever
            at least one full batch is produced.

    Returns:
        ``(x_batches, y_batches)``: two lists of ``int32`` arrays of shape
        ``(batch_size, longest_lyric_in_batch)``. Each target array is the
        input shifted one step to the left, with the last column unchanged.
    """
    inputs, targets = [], []
    full_chunks = len(lyrics_vec) // batch_size

    for offset in range(0, full_chunks * batch_size, batch_size):
        chunk = lyrics_vec[offset:offset + batch_size]
        # The width of this batch is dictated by its longest lyric.
        width = max(len(seq) for seq in chunk)
        # Start from a batch filled with the space id, then overwrite the
        # leading part of every row with the actual lyric.
        padded = np.full((batch_size, width), word_to_int[' '], np.int32)
        for row, seq in enumerate(chunk):
            padded[row, :len(seq)] = seq

        # The target is the input moved one position towards the front:
        #   x: [6, 2, 4, 6, 9]   ->   y: [2, 4, 6, 9, 9]
        #   x: [1, 4, 2, 8, 5]   ->   y: [4, 2, 8, 5, 5]
        shifted = np.copy(padded)
        shifted[:, :-1] = padded[:, 1:]

        inputs.append(padded)
        targets.append(shifted)

    return inputs, targets

In [193]:
def generate_data(lyrics_vec, word_to_int):
    """Build one input/target array pair per lyric for next-word prediction.

    Every lyric becomes its own ``int32`` array; no padding is applied here,
    so the arrays have different lengths.

    Args:
        lyrics_vec: List of lyrics, each a list of word ids.
        word_to_int: Unused; kept so existing callers keep working. No
            padding id is needed because each array is exactly as long as
            its lyric, so the vocabulary does not have to contain ``' '``.

    Returns:
        ``(x_batches, y_batches)``: two lists of 1-D ``int32`` arrays. Each
        target is its input shifted one step to the left, with the final
        element left unchanged.
    """
    x_batches = []
    y_batches = []
    for lyric in lyrics_vec:
        x_data = np.array(lyric, dtype=np.int32)
        # The target is the input moved one position towards the front:
        #   x: [6, 2, 4, 6, 9]   ->   y: [2, 4, 6, 9, 9]
        y_data = np.copy(x_data)
        y_data[:-1] = x_data[1:]
        x_batches.append(x_data)
        y_batches.append(y_data)
    return x_batches, y_batches

In [194]:
# Create the checkpoint directory together with any missing parents.
# makedirs copes with a path that has no parent component or is nested
# several levels deep, which two separate os.mkdir calls do not.
if not os.path.exists(checkpoints_dir):
    os.makedirs(checkpoints_dir)

# Encode the corpus, then build one (input, shifted target) pair per lyric.
poems_vector, word_to_int, vocabularies = process_lyrics(file_path)
vocab_size = len(vocabularies)
bi,bo = generate_data(poems_vector, word_to_int)

all 131 songs...
131
True


In [195]:
# Length of the longest encoded lyric; every sequence is padded up to it.
mlen = max(map(len, bi))
# Right-pad inputs and targets alike, filling with the end-token id.
bis, bos = (
    sequence.pad_sequences(seqs, padding='post', maxlen=mlen, value=word_to_int[end_token])
    for seqs in (bi, bo)
)

In [204]:
print('Build model...')
# Layer stack:
#   1. Embedding turns word ids into 128-dimensional vectors.
#   2. The first LSTM (no return_sequences) emits one 128-d vector per sample.
#   3. RepeatVector copies that vector mlen times, one per output step.
#   4. The second LSTM returns the full sequence.
#   5. A time-distributed Dense + softmax scores vocab_size words per step.
model = Sequential()
model.add(Embedding(vocab_size+1, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(RepeatVector(mlen))
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size)))
model.add(Activation('softmax'))

# categorical_crossentropy expects one-hot encoded targets.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, None, 128)         733056    
_________________________________________________________________
lstm_20 (LSTM)               (None, 128)               131584    
_________________________________________________________________
repeat_vector_8 (RepeatVecto (None, 481, 128)          0         
_________________________________________________________________
lstm_21 (LSTM)               (None, 481, 128)          131584    
_________________________________________________________________
time_distributed_4 (TimeDist (None, 481, 5726)         738654    
_________________________________________________________________
activation_3 (Activation)    (None, 481, 5726)         0         
Total params: 1,734,878.0
Trainable params: 1,734,878.0
Non-trainable params: 0.0
_____________________________________________

In [184]:
from keras.utils import to_categorical

# One-hot encode the padded target ids over the vocabulary.
yc = to_categorical(bos, vocab_size)
# Force the layout (samples, time steps, classes) expected by the model output.
yd = yc.reshape(bos.shape[:2] + (-1,))

In [197]:
# Train in four rounds of seven epochs each, announcing every round with a
# blank line, a 50-dash rule and the round number.
for iteration in range(1, 5):
    print('\n' + '-' * 50)
    print('Iteration', iteration)
    model.fit(bis, yd, batch_size=batch_size, epochs=7)


--------------------------------------------------
Iteration 1
Epoch 1/1

--------------------------------------------------
Iteration 2
Epoch 1/1

--------------------------------------------------
Iteration 3
Epoch 1/1

--------------------------------------------------
Iteration 4
Epoch 1/1


In [202]:
# Encode the start token character by character into a batch of one sequence.
x = np.array([list(word_to_int.get(ch) for ch in start_token)])
print(x)
# Run the model on that single seed sequence and report the output shape.
y = model.predict(x)
print(y[0].shape)

[[14]]
(481, 5726)
