Data: https://drive.google.com/file/d/1HneKy22aVGKYlSC8p4Cttd6-DKdFdY83/view?usp=sharing

Tutorial: https://github.com/jiuzhangjiangzuo/AICamp1.NLP

In [1]:
import os
import warnings

import numpy as np
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [2]:
# Path to the word-segmented English/Chinese training corpus.
# Set the TRAIN_DIR environment variable to point at your own copy of the file;
# the original local path is kept as the fallback so existing setups still work.
TRAIN_DIR = os.environ.get(
    'TRAIN_DIR',
    '/Users/songyihe/Documents/Study/AI_Projects/large-datasets/english-chinese/segmented_train_seg_by_word.txt',
)

# Load Data

In [3]:
# Read the parallel corpus. Lines alternate: an odd-numbered line holds an
# English sentence and the even-numbered line after it holds the Chinese one.
eng_lines, ch_lines = [], []
# Explicit UTF-8: the file contains Chinese text and the platform's default
# encoding is not guaranteed to be UTF-8.
with open(TRAIN_DIR, encoding='utf-8') as file:
    for num_line, line in enumerate(tqdm(file), start=1):
        if num_line % 2 == 1:
            # English side: lower-cased, whitespace-separated tokens.
            eng_line = [token.lower() for token in line.strip('\n').split()]
            continue

        # Chinese side: drop the spaces and use one token per character.
        ch_line = list(line.strip('\n').replace(' ', ''))

        # Keep the pair only when both sides have at most 5 tokens.
        if len(eng_line) <= 5 and len(ch_line) <= 5:
            eng_lines.append(eng_line)
            ch_lines.append(ch_line)

20000000it [01:02, 318596.10it/s]


In [4]:
# Report how many sentences were kept for each language.
print(
    f'English Lines: {len(eng_lines)}',
    f'Chinese Lines: {len(ch_lines)}',
    sep='\n',
)

English Lines: 103912
Chinese Lines: 103912


In [5]:
# Show the first English sentence and its Chinese counterpart.
print(eng_lines[0], ch_lines[0], sep='\n')

['deuces', 'the', 'winner', '.']
['一', '对', '二', '胜', '。']


# Build Vocab

In [6]:
from collections import Counter

In [7]:
# Flatten the sentences into one token list per language. Duplicates are kept.
eng_vocab = [word for sentence in eng_lines for word in sentence]
ch_vocab = [word for sentence in ch_lines for word in sentence]

In [8]:
# Ten most frequent tokens of each language, with their counts.
print(
    f'Most Common English Word: {Counter(eng_vocab).most_common(10)}',
    f'Most Common Chinese Word: {Counter(ch_vocab).most_common(10)}',
    sep='\n',
)

Most Common English Word: [('.', 49461), ('?', 14511), ('the', 10472), ('i', 10013), (',', 9489), ('!', 8807), ('you', 7493), ('a', 6860), ("'", 5555), ('it', 5524)]
Most Common Chinese Word: [('。', 44900), ('我', 15087), ('？', 14454), ('你', 9087), ('！', 8915), ('了', 8663), ('的', 8053), ('，', 7291), ('一', 6091), ('是', 5946)]


In [9]:
def add_word_to_dict(word2idx, idx2word, word):
    """Register ``word`` in both lookup tables unless it is already known.

    A new word receives the next free index, which is the current size of
    ``word2idx``. Both dictionaries are modified in place.

    Args:
        word2idx: dict mapping word -> index.
        idx2word: dict mapping index -> word.
        word: the word to register.

    Returns:
        None.
    """
    if word not in word2idx:
        next_index = len(word2idx)
        word2idx[word] = next_index
        idx2word[next_index] = word

In [10]:
# Special tokens are registered first, so they get the same indices (0..3)
# in both vocabularies.
spec_words = ['<eos>','<start>','<end>','<unk>']
eng_word2idx, eng_idx2word, ch_word2idx, ch_idx2word = {}, {}, {}, {}

for word in spec_words:
    add_word_to_dict(eng_word2idx, eng_idx2word, word)
    add_word_to_dict(ch_word2idx, ch_idx2word, word)

# Iterate in sorted order: the iteration order of a set of strings changes
# between interpreter runs (string hash randomisation), which would give every
# word a different index after each kernel restart.
for word in sorted(set(eng_vocab)):
    add_word_to_dict(eng_word2idx, eng_idx2word, word)

for word in sorted(set(ch_vocab)):
    add_word_to_dict(ch_word2idx, ch_idx2word, word)

In [11]:
# Vocabulary sizes, special tokens included.
print(
    f'Size of English Dict: {len(eng_word2idx)}',
    f'Size of Chinese Dict: {len(ch_word2idx)}',
    sep='\n',
)

Size of English Dict: 21743
Size of Chinese Dict: 4058


# Prepare Data

In [43]:
# Turn every sentence into a list of vocabulary indices.
#   data_x_in  : English ids followed by <eos>       (encoder input)
#   data_y_in  : <start> followed by Chinese ids     (decoder input)
#   data_y_out : Chinese ids followed by <eos>       (decoder target)
# The *_len lists store the length of each sequence including its extra token.
data_x_in, data_y_in, data_y_out = [], [], []
data_x_len, data_y_len = [], []

eng_unk, eng_eos = eng_word2idx['<unk>'], eng_word2idx['<eos>']
ch_unk, ch_start, ch_eos = ch_word2idx['<unk>'], ch_word2idx['<start>'], ch_word2idx['<eos>']

for sentence in eng_lines:
    token_ids = [eng_word2idx.get(token, eng_unk) for token in sentence]
    token_ids.append(eng_eos)
    data_x_in.append(token_ids)
    data_x_len.append(len(token_ids))

for sentence in ch_lines:
    token_ids = [ch_word2idx.get(token, ch_unk) for token in sentence]
    data_y_in.append([ch_start] + token_ids)
    data_y_out.append(token_ids + [ch_eos])
    data_y_len.append(len(token_ids) + 1)

In [44]:
# Decode the first example of each sequence set back into tokens as a sanity check.
for sample_ids, idx2word in (
    (data_x_in[0], eng_idx2word),
    (data_y_in[0], ch_idx2word),
    (data_y_out[0], ch_idx2word),
):
    print([idx2word[idx] for idx in sample_ids])

['deuces', 'the', 'winner', '.', '<eos>']
['<start>', '一', '对', '二', '胜', '。']
['一', '对', '二', '胜', '。', '<eos>']


In [45]:
import tensorflow.compat.v1 as tf

In [47]:
# Right-pad every sequence to the length of the longest sequence in its set.
data_x_in_pad = tf.keras.preprocessing.sequence.pad_sequences(
    data_x_in, padding='post', value=eng_word2idx['<eos>'])
# The target sequences hold Chinese ids, so their pad id must come from the
# Chinese vocabulary. (The English lookup used before only worked because both
# vocabularies register the special tokens in the same order.)
data_y_in_pad = tf.keras.preprocessing.sequence.pad_sequences(
    data_y_in, padding='post', value=ch_word2idx['<end>'])
data_y_out_pad = tf.keras.preprocessing.sequence.pad_sequences(
    data_y_out, padding='post', value=ch_word2idx['<end>'])

# Sequence lengths as arrays so they can be indexed with a batch of indices.
data_x_len = np.asarray(data_x_len)
data_y_len = np.asarray(data_y_len)

In [48]:
# Shapes of the padded id matrices and of the length vectors.
print(
    f'data_x_in_pad shape: {data_x_in_pad.shape}',
    f'data_y_in_pad shape: {data_y_in_pad.shape}',
    f'data_y_out_pad shape: {data_y_out_pad.shape}',
    f'data_x_len shape: {data_x_len.shape}',
    f'data_y_len shape: {data_y_len.shape}',
    sep='\n',
)

data_x_in_pad shape: (103912, 6)
data_y_in_pad shape: (103912, 6)
data_y_out_pad shape: (103912, 6)
data_x_len shape: (103912,)
data_y_len shape: (103912,)


# Model

In [49]:
import tensorflow.compat.v1 as tf
import tensorflow_addons as tfa
from tensorflow.python.layers import core as layers_core

In [50]:
# Start from an empty default graph so re-running the model cell does not
# pile up duplicate ops and variables.
tf.reset_default_graph()
# The model below is built with placeholders and run through a Session,
# which requires graph mode instead of eager execution.
tf.disable_eager_execution()
# log_device_placement: log the device each op is assigned to.
# allow_soft_placement: fall back to another device if the requested one is unavailable.
config = tf.ConfigProto(log_device_placement=True,allow_soft_placement = True)
# Session used by the initialisation and training cells.
session = tf.Session(config=config)

Device mapping: no known devices.


In [51]:
# Hyperparameters and sizes used by the model and training cells.
embedding_size = 512   # word embedding vector length
num_units = 512        # hidden layer size
batch_size = 128       # number of sentence pairs per training step
max_grad = 1.0         # clip norm c used for global L2-norm gradient clipping
dropout = 0.2          # NOTE(review): not referenced by the model cell shown in this notebook -- confirm whether it is still needed
src_vocab_size = len(eng_word2idx)
target_vocat_size = len(ch_word2idx)
seq_max_len = 5 + 1    # passed as the width of the loss mask; presumably 5 tokens plus one end token -- confirm
maximum_iterations = 10  # stop decoding once the prediction reaches 10 steps

In [52]:
# Build the seq2seq graph: LSTM encoder, LSTM decoder with a training branch
# (teacher forcing) and a prediction branch (greedy decoding), the masked
# cross-entropy loss, and an SGD update with global-norm gradient clipping.
with tf.device('/cpu:0'):
    initializer = tf.random_uniform_initializer(-0.08, 0.08)
    tf.get_variable_scope().set_initializer(initializer)

    # Inputs, all fed per batch:
    #   x     : source token ids
    #   y     : target token ids the decoder should produce
    #   y_in  : decoder input
    #   x_len / y_len : sequence length of each example in x / y
    x = tf.placeholder("int32", [None, None])
    y = tf.placeholder("int32", [None, None])
    y_in = tf.placeholder("int32",[None,None])  # decoder input

    x_len = tf.placeholder("int32",[None])
    y_len = tf.placeholder("int32",[None])
    learning_rate = tf.placeholder(tf.float32, shape=[])

    # embedding
    embedding_encoder = tf.get_variable(
        "embedding_encoder", [src_vocab_size, embedding_size],dtype=tf.float32)
    embedding_decoder = tf.get_variable(
        "embedding_decoder", [target_vocat_size, embedding_size],dtype=tf.float32)

    encoder_emb_inp = tf.nn.embedding_lookup(
        embedding_encoder, x)
    decoder_emb_inp = tf.nn.embedding_lookup(
        embedding_decoder, y_in)

    # ====== Encoder (same as in the previous lesson) =======

    # Build RNN cell
    encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

    # Run Dynamic RNN
    #   encoder_outputs: [batch_size, max_time, num_units] (time_major=False)
    #   encoder_state: final LSTM state, used below as the decoder's initial state
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        encoder_cell, encoder_emb_inp,
        sequence_length=x_len, time_major=False,dtype=tf.float32)

    # ====== The decoder has two structures, one for training and one for
    # prediction; in TensorFlow this is handled with variable reuse =======
    # Variable reuse: a `with` statement builds a code block under a given
    # scope; when the scope string is the same, the variables can be reused.

    batch_size_in = tf.shape(x)[0]
    projection_layer = layers_core.Dense(
        len(ch_word2idx), use_bias=False)

    # Dynamic decoding
    with tf.variable_scope("decode_layer"):
        # Build RNN cell
        decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

        # To implement the training structure shown in the slides, TensorFlow
        # needs two pieces: a training sampler and dynamic decode.

        # Sampler
        sampler = tfa.seq2seq.TrainingSampler(time_major=False)  # y_len tells how many time steps to decode
        # NOTE(review): dynamic_decode below receives the same inputs and
        # sequence_length, so this explicit call may be redundant -- confirm.
        sampler.initialize(decoder_emb_inp, y_len)
        # Decoder
        decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer)
        # Dynamic decoding
        outputs, _, _ = tfa.seq2seq.dynamic_decode(
            decoder,
            decoder_init_input = decoder_emb_inp,
            decoder_init_kwargs= {
                'initial_state' : encoder_state,
                'sequence_length': y_len
            }
        )
        logits = outputs.rnn_output
        # Decoding stops once every sequence in the batch is finished, so the
        # time dimension of logits is the longest target of the batch and can
        # be shorter than seq_max_len. Size the mask (and the labels, below)
        # from logits so the three tensors always agree.
        decode_steps = tf.shape(logits)[1]
        target_weights = tf.sequence_mask(  # mask out the softmax loss of padding positions
            y_len, decode_steps, dtype=logits.dtype)

    # predicting
    # Sampler: switch to the greedy embedding sampler
    with tf.variable_scope("decode_layer", reuse=True):
        # Sampler
        sampler = tfa.seq2seq.GreedyEmbeddingSampler()
        # Decoder
        decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer)
        # Dynamic decoding
        outputs, _, _ = tfa.seq2seq.dynamic_decode(
            decoder, maximum_iterations=maximum_iterations,
            decoder_init_input = embedding_decoder,
            decoder_init_kwargs= {
                'initial_state' : encoder_state,
                'start_tokens': tf.fill([batch_size_in], ch_word2idx['<start>']),
                # The training targets are terminated with <eos> (<end> is only
                # masked padding), so <eos> is the token that ends a translation.
                'end_token': ch_word2idx['<eos>']
            }
        )
        translations = outputs.sample_id


    # calculate loss
    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y[:, :decode_steps], logits=logits)
    # Sum of the unmasked token losses, averaged over the examples in the batch.
    train_loss = (tf.reduce_sum(crossent * target_weights) /
        tf.cast(batch_size_in,tf.float32))

    optimizer_ori = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    trainable_params = tf.trainable_variables()
    gradients = tf.gradients(train_loss, trainable_params)
    # Gradient Clip
    clip_gradients, _ = tf.clip_by_global_norm(gradients, max_grad)
    global_step = tf.Variable(0, trainable=False, name='global_step')
    # Despite its name, `optimizer` is the training op run at every step.
    optimizer = optimizer_ori.apply_gradients(
            zip(clip_gradients, trainable_params), global_step=global_step)

Object was never used (type <class 'tensorflow.python.framework.ops.Operation'>):
<tf.Operation 'decode_layer_1/decoder/assert_greater/Assert/Assert' type=Assert>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/Users/songyihe/opt/anaconda3/envs/ai_camp/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py", line 201, in wrapper
    return target(*args, **kwargs)  File "/Users/songyihe/opt/anaconda3/envs/ai_camp/lib/python3.6/site-packages/tensorflow/python/ops/check_ops.py", line 992, in assert_greater
    y, data, summarize, message, name)  File "/Users/songyihe/opt/anaconda3/envs/ai_camp/lib/python3.6/site-packages/tensorflow/python/ops/check_ops.py", line 373, in _binary_assert
    return control_flow_ops.Assert(condition, data, summarize=summarize)  File "/Users/songyihe/opt/anaconda3/envs/ai_camp/lib/python3.6/site-packages/tensorflow/python/util/dispatch.py", line 201, in wrapper
    return target(*args, **kwargs)  F

In [53]:
# Give every variable of the graph its initial value before training.
init_op = tf.global_variables_initializer()
session.run(init_op)

In [56]:
# Train with mini-batch SGD and record the loss of every step in `losses`.
losses = []
beginning_lr = 4
max_epoch = 2
num_examples = len(data_x_in_pad)
for one_epoch in range(0, max_epoch):
    # Visit the examples in a fresh random order every epoch.
    index = np.arange(num_examples)
    np.random.shuffle(index)

    # Learning rate schedule: constant for the first 20 epochs, then halved
    # every epoch. It depends on the epoch only, so compute it once per epoch.
    batch_lr = beginning_lr if one_epoch < 20 else beginning_lr * 0.5 ** (one_epoch - 20)

    # Only full batches are used. Ending the range early means the trailing
    # partial batch is never produced, so the progress-bar total is accurate.
    for i in tqdm(range(0, num_examples - batch_size + 1, batch_size)):
        batchindex = index[i:i + batch_size]

        _,batch_loss = session.run([optimizer,train_loss],feed_dict={
            x:data_x_in_pad[batchindex],
            y:data_y_out_pad[batchindex],
            y_in:data_y_in_pad[batchindex],

            x_len:data_x_len[batchindex],
            y_len:data_y_len[batchindex],
            learning_rate:batch_lr,
        })
        losses.append(batch_loss)

  5%|▌         | 42/812 [00:14<04:28,  2.86it/s]


KeyboardInterrupt: 