
Merge

zeynepakkalyoncu committed Dec 19, 2018
2 parents ee0a9f9 + 5da1f5a commit 3cccfaf9bb6af07e6fcc2d7ebf53e7fd85247300
Showing with 41 additions and 24 deletions.
  1. +6 −6 lib/data/util.py
  2. +1 −1 lib/model/args.py
  3. +20 −16 lib/model/seq2seq.py
  4. +14 −1 lib/model/util.py
lib/data/util.py
@@ -1,4 +1,5 @@
import os
import re

import dill
import nltk
@@ -9,19 +10,18 @@


def preprocess(source_data, target_data):
    # TODO: Preprocess in one pass
    # Convert to lowercase characters
    source_data = source_data.swifter.apply(lambda x: x.str.lower())
    target_data = target_data.swifter.apply(lambda x: x.str.lower())

    # Remove punctuation
    # Note: Does not remove underscores
    source_data = source_data.swifter.apply(lambda x: x.str.replace(r'[^\w\s]', ''))
    target_data = target_data.swifter.apply(lambda x: x.str.replace(r'[^\w\s]', ''))

    # Add SOS and EOS tokens
    target_data = target_data.swifter.apply(lambda x: 'SOS ' + x + ' EOS')

    # Remove punctuation and digits
    # WARNING: Removes special characters in some languages
    # source_data = source_data.swifter.apply(lambda x: x.str.replace('[^a-zA-Z\s]', ''))
    # target_data = target_data.swifter.apply(lambda x: x.str.replace('[^a-zA-Z\s]', ''))

    source_data = source_data.values.flatten()
    target_data = target_data.values.flatten()
    return source_data, target_data
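For reference, preprocess operates on pandas DataFrames and relies on swifter to parallelize the column-wise apply calls. A minimal usage sketch, assuming swifter is installed and the repository root is on the path; the column name and sample sentences are invented for illustration:

import pandas as pd
from lib.data.util import preprocess

# Hypothetical single-column frames of raw source/target sentences
source_df = pd.DataFrame({'text': ['Hello, world!', 'How are you?']})
target_df = pd.DataFrame({'text': ['Hallo, Welt!', 'Wie geht es dir?']})

src, tgt = preprocess(source_df, target_df)
# src -> roughly array(['hello world', 'how are you'], dtype=object)
# tgt -> roughly array(['SOS hallo welt EOS', 'SOS wie geht es dir EOS'], dtype=object)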
lib/model/args.py
@@ -20,7 +20,7 @@ def get_args():
    parser.add_argument('--target-vocab-size', type=int, default=10000)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--decay', type=float, default=0.0)
-   parser.add_argument('--beam-size', type=int, default=2)
+   parser.add_argument('--beam-size', type=int, default=1)
    parser.add_argument('--seed', type=int, default=3435)
    parser.add_argument('--dataset', type=str, default='en_vi', choices=['en_de', 'de_en', 'en_vi', 'vi_en'])
    parser.add_argument('--patience', type=int, default=5)
lib/model/seq2seq.py
@@ -11,15 +11,16 @@
from keras.callbacks import ModelCheckpoint

from lib.model.metrics import bleu_score
-from lib.model.util import lr_scheduler
+from lib.model.util import lr_scheduler, TimeHistory


class Seq2Seq:
    def __init__(self, config):
        self.config = config

        if self.config.cpu:
            devices = list('/cpu:' + str(x) for x in (0, 0))
-       if not self.config.cpu:
+       else:
            devices = list('/gpu:' + x for x in config.devices)

        # Encoder
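The devices list only holds TensorFlow device strings ('/cpu:0', '/gpu:0', ...); presumably the encoder and decoder graphs are later pinned to them. A generic sketch of how such strings are typically used, not the repository's exact wiring:

import tensorflow as tf

devices = ['/cpu:0', '/cpu:0']  # what the CPU branch above produces

# Hypothetical placement: first device hosts the encoder, second the decoder
with tf.device(devices[0]):
    pass  # encoder layers would be constructed here
with tf.device(devices[1]):
    pass  # decoder layers would be constructed here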
@@ -43,13 +44,13 @@ def encode(self, encoder_inputs, recurrent_unit='lstm'):
        encoder_embedding = Embedding(self.config.source_vocab_size, self.config.embedding_dim,
                                      weights=[self.config.source_embedding_map], trainable=False)
        encoder_embedded = encoder_embedding(encoder_inputs)
-       if recurrent_unit == 'lstm':
+       if recurrent_unit.lower() == 'lstm':
            encoder = LSTM(self.config.hidden_dim, return_state=True, return_sequences=True, recurrent_initializer=initial_weights)(encoder_embedded)
            for i in range(1, self.config.num_encoder_layers):
                encoder = LSTM(self.config.hidden_dim, return_state=True, return_sequences=True)(encoder)
            _, state_h, state_c = encoder
            return [state_h, state_c]
-       else: # GRU
+       else:
            encoder = GRU(self.config.hidden_dim, return_state=True, return_sequences=True, recurrent_initializer=initial_weights)(encoder_embedded)
            for i in range(1, self.config.num_encoder_layers):
                encoder = GRU(self.config.hidden_dim, return_state=True, return_sequences=True)(encoder)
@@ -60,24 +61,25 @@ def decode(self, decoder_inputs, encoder_states, recurrent_unit='lstm'):
        decoder_embedding = Embedding(self.config.target_vocab_size, self.config.embedding_dim,
                                      weights=[self.config.target_embedding_map], trainable=False)
        decoder_embedded = decoder_embedding(decoder_inputs)
-       if recurrent_unit == 'lstm':
+       if recurrent_unit.lower() == 'lstm':
            decoder = LSTM(self.config.hidden_dim, return_state=True, return_sequences=True)(decoder_embedded, initial_state=encoder_states) # Accepts concatenated encoder states as input
            for i in range(1, self.config.num_decoder_layers):
                decoder = LSTM(self.config.hidden_dim, return_state=True, return_sequences=True)(decoder) # Use the final encoder state as context
            decoder_outputs, decoder_states = decoder[0], decoder[1:]
-       else: # GRU
+       else:
            decoder = GRU(self.config.hidden_dim, return_state=True, return_sequences=True)(decoder_embedded, initial_state=encoder_states) # Accepts concatenated encoder states as input
            for i in range(1, self.config.num_decoder_layers):
                decoder = GRU(self.config.hidden_dim, return_state=True, return_sequences=True)(decoder) # Use the final encoder state as context
            decoder_outputs, decoder_states = decoder[0], decoder[1]
        decoder_dense = Dense(self.config.target_vocab_size, activation='softmax')
        return decoder_dense(decoder_outputs)
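The encode/decode methods above only return tensors; how they are stitched into a trainable model is not visible in this hunk. As a point of reference, here is a minimal self-contained sketch of the same teacher-forced encoder-decoder wiring in Keras. Vocabulary sizes, dimensions, and names are placeholders, not the repository's configuration:

from keras.layers import Dense, Embedding, Input, LSTM
from keras.models import Model

def build_seq2seq_sketch(source_vocab=10000, target_vocab=10000, emb_dim=128, hidden_dim=256):
    # Encoder: embed source token ids, run an LSTM, keep only the final states
    enc_in = Input(shape=(None,), name='encoder_inputs')
    enc_emb = Embedding(source_vocab, emb_dim)(enc_in)
    _, state_h, state_c = LSTM(hidden_dim, return_state=True)(enc_emb)

    # Decoder: embed the shifted target ids (starting with SOS), seed the LSTM
    # with the encoder states, and project every timestep onto the target vocabulary
    dec_in = Input(shape=(None,), name='decoder_inputs')
    dec_emb = Embedding(target_vocab, emb_dim)(dec_in)
    dec_seq, _, _ = LSTM(hidden_dim, return_sequences=True, return_state=True)(
        dec_emb, initial_state=[state_h, state_c])
    probs = Dense(target_vocab, activation='softmax')(dec_seq)
    return Model([enc_in, dec_in], probs)

The sketch mirrors the contract above: the encoder's final [state_h, state_c] seed the decoder, and a softmax Dense maps each decoder timestep to a distribution over the target vocabulary.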

    def train(self, encoder_train_input, decoder_train_input, decoder_train_target):
        checkpoint_filename = \
-           'ep{epoch:02d}_nl%d_ds%d_sv%d_sv%d_tv%d.hdf5' % (self.config.num_encoder_layers, self.config.num_decoder_layers, self.config.dataset_size,
-                                                            self.config.source_vocab_size, self.config.target_vocab_size)
-       callbacks = [lr_scheduler(initial_lr=self.config.lr, decay_factor=self.config.decay),
+           'ep{epoch:02d}_el%d_dl%d_ds%d_sv%d_tv%d.hdf5' % (self.config.num_encoder_layers, self.config.num_decoder_layers, self.config.dataset_size,
+                                                            self.config.source_vocab_size, self.config.target_vocab_size)
+       time_callback = TimeHistory()
+       callbacks = [lr_scheduler(initial_lr=self.config.lr, decay_factor=self.config.decay), time_callback,
                     ModelCheckpoint(os.path.join(os.getcwd(), 'data', 'checkpoints', self.config.dataset, checkpoint_filename),
                                     monitor='val_loss', verbose=1, save_best_only=False,
                                     save_weights_only=True, mode='auto', period=1)]
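The filename template mixes two formatting passes: the %-specifiers are filled immediately from the config, while Keras' ModelCheckpoint fills the {epoch:02d} placeholder at save time. A worked example with made-up config values:

# Illustrative values only (2 encoder layers, 2 decoder layers, 100000 pairs,
# 10000-word vocabularies); the %-pass leaves the {epoch:02d} field untouched
template = 'ep{epoch:02d}_el%d_dl%d_ds%d_sv%d_tv%d.hdf5' % (2, 2, 100000, 10000, 10000)
# template == 'ep{epoch:02d}_el2_dl2_ds100000_sv10000_tv10000.hdf5'
# ModelCheckpoint later substitutes the epoch, e.g. 'ep03_el2_dl2_ds100000_sv10000_tv10000.hdf5'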
@@ -89,14 +91,16 @@ def train(self, encoder_train_input, decoder_train_input, decoder_train_target):

    def train_generator(self, training_generator, validation_generator):
        checkpoint_filename = \
-           'ep{epoch:02d}_nl%d_ds%d_sv%d_sv%d_tv%d.hdf5' % (self.config.num_encoder_layers, self.config.num_decoder_layers, self.config.dataset_size,
-                                                            self.config.source_vocab_size, self.config.target_vocab_size)
-       callbacks = [lr_scheduler(initial_lr=self.config.lr, decay_factor=self.config.decay),
+           'ep{epoch:02d}_el%d_dl%d_ds%d_sv%d_tv%d.hdf5' % (self.config.num_encoder_layers, self.config.num_decoder_layers, self.config.dataset_size,
+                                                            self.config.source_vocab_size, self.config.target_vocab_size)
+       time_callback = TimeHistory()
+       callbacks = [lr_scheduler(initial_lr=self.config.lr, decay_factor=self.config.decay), time_callback,
                     ModelCheckpoint(os.path.join(os.getcwd(), 'data', 'checkpoints', self.config.dataset, checkpoint_filename),
                                     monitor='val_loss', verbose=1, save_best_only=False,
                                     save_weights_only=True, mode='auto', period=1)]
        self.model.fit_generator(training_generator, epochs=self.config.epochs, callbacks=callbacks,
                                 validation_data=validation_generator)
+       print("Training time (in seconds):", time_callback.times)

    def predict(self, encoder_predict_input, decoder_predict_input):
        return self.model.predict([encoder_predict_input, decoder_predict_input])
@@ -117,9 +121,9 @@ def beam_search(self, encoder_predict_input):
                    list(hyp[:(i + 1)]) + [next_hyp] + ([0] * (encoder_predict_input.shape[0] - i - 1))
                ))

            k_beam = sorted(all_hypotheses, key=lambda x: x[0])[-beam_size:] # Sort by probability

        return k_beam[-1][1] # Pick hypothesis with highest probability
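Each hypothesis is a (score, token sequence) pair, so sorting in ascending score order and keeping the last beam_size entries prunes the beam to the most probable candidates, and k_beam[-1] is the single best one. A toy illustration, with made-up scores and token ids:

beam_size = 2
all_hypotheses = [(-4.1, [1, 7, 2]), (-2.3, [1, 5, 2]), (-3.0, [1, 9, 2])]
k_beam = sorted(all_hypotheses, key=lambda x: x[0])[-beam_size:]
# k_beam == [(-3.0, [1, 9, 2]), (-2.3, [1, 5, 2])]  -- the two most probable survive
best_tokens = k_beam[-1][1]  # [1, 5, 2], the highest-scoring hypothesis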

    def evaluate(self, encoder_predict_input, decoder_predict_input, decoder_train_target):
        if self.config.beam_size > 0:
lib/model/util.py
@@ -1,12 +1,25 @@
import codecs
import os
import time

import dill
import keras
import numpy as np
from keras.callbacks import LearningRateScheduler
import keras.backend as K
from tqdm import tqdm


class TimeHistory(keras.callbacks.Callback):
    def on_train_begin(self, logs=dict()):
        self.times = []

    def on_epoch_begin(self, epoch, logs=dict()):
        self.epoch_time_start = time.time()

    def on_epoch_end(self, epoch, logs=dict()):
        self.times.append(time.time() - self.epoch_time_start)
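TimeHistory simply records wall-clock seconds per epoch. A minimal standalone usage sketch with a throwaway model; the real training loop passes the callback through Seq2Seq.train and train_generator as shown above, and this assumes the repository root is on the path:

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from lib.model.util import TimeHistory

model = Sequential([Dense(1, input_dim=4)])
model.compile(optimizer='sgd', loss='mse')

time_callback = TimeHistory()
model.fit(np.random.rand(64, 4), np.random.rand(64, 1),
          epochs=3, verbose=0, callbacks=[time_callback])
print(time_callback.times)  # e.g. [0.21, 0.05, 0.05] seconds per epoch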


def lr_scheduler(initial_lr, decay_factor):
    def schedule(epoch):
        if epoch and epoch < 5:
