In [0]:
# Imports
from __future__ import absolute_import, division, print_function
!pip install tensorflow-gpu==2.0.0a
from constants import *
warnings.filterwarnings("ignore")

In [0]:
# Run configuration: which encoder/run is used and where its artifacts live.
RUN_NUMBER = 10
DEFAULT_GLOBAL_SEED = 1
ENCODER_MODEL = 'LSTM'
CHECKPOINT_PATH = './checkpoints/' + ENCODER_MODEL + '/' + str(RUN_NUMBER)
RESULTS_PATH = './results/' + "LARGE-" + ENCODER_MODEL + '/' + str(RUN_NUMBER) + '/'
ENCODING_PATH = './encoding/' + "LARGE-" + ENCODER_MODEL + '/' + str(RUN_NUMBER) + '/'
TRAIN_FROM_SCRATCH = True

# Create the output directories. os.makedirs(..., exist_ok=True) also creates
# missing parents (os.mkdir fails when './results/' itself does not exist yet)
# and is a no-op when the directory is already there, so the cell can be re-run.
os.makedirs('./results/' + ENCODER_MODEL + '/', exist_ok=True)
if TRAIN_FROM_SCRATCH:
    # Per-run results directory, only needed when a new training run starts.
    os.makedirs('./results/' + ENCODER_MODEL + '/' + str(RUN_NUMBER) + '/',
                exist_ok=True)
# RESULTS_PATH / ENCODING_PATH carry a "LARGE-" prefix, so they are different
# directories from the two above; the encodings are later written below
# ENCODING_PATH, which therefore has to exist.
os.makedirs(RESULTS_PATH, exist_ok=True)
os.makedirs(ENCODING_PATH, exist_ok=True)

BCN_DATASET = "SST-2"  # "SST-5", "SST-2" or "MMT"

# Directory holding the sentiment data for the selected dataset.
if BCN_DATASET == "SST-5":
    elmoDir = "elmo-sst5"
elif BCN_DATASET == "SST-2":
    elmoDir = "elmo-sst2"
else:
    # No sentiment directory is known for other datasets (e.g. "MMT"); keep the
    # name defined so later cells fail with a clear error instead of NameError.
    elmoDir = None


In [0]:
# Load the raw parallel corpus: English (en) and German (de) text for the
# train, test and validation splits.
train_en, train_de, test_en, test_de, val_en, val_de = readdata()

# English: tokenize the three splits, then release the raw text.
(tok_train_en, tok_val_en, tok_test_en,
 train_en_sen_len, val_en_sen_len, test_en_sen_len,
 en_dict_w2i, en_dict_i2w, en_max_words) = tokenize(
    train_en, val_en, test_en, max_length=MAX_INPUT_SIZE)
del train_en, val_en, test_en

# Register the padding token under index 0 in both lookup directions.
en_dict_w2i['<PAD>'] = 0
en_dict_i2w[0] = '<PAD>'
# Largest token id that occurs in the English training data.
en_vocab_size = np.amax(tok_train_en)

# German: same procedure as for English.
(tok_train_de, tok_val_de, tok_test_de,
 train_de_sen_len, val_de_sen_len, test_de_sen_len,
 de_dict_w2i, de_dict_i2w, de_max_words) = tokenize(
    train_de, val_de, test_de, max_length=MAX_INPUT_SIZE)
del train_de, val_de, test_de

# Register the padding token under index 0 in both lookup directions.
de_dict_w2i['<PAD>'] = 0
de_dict_i2w[0] = '<PAD>'
# Largest token id that occurs in the German training data.
de_vocab_size = np.amax(tok_train_de)

# GloVe embedding matrix for the English vocabulary (index -> word mapping
# supplied through en_dict_i2w).
glove_embedding_matrix = create_embedding_indexmatrix(
    en_max_words,
    embedding_dim=EMBEDDING_DIM,
    dict_en=en_dict_i2w,
)


In [0]:
# Frozen (trainable=False) embedding layer initialised from
# glove_embedding_matrix.
embed = Embedding(input_dim=en_max_words, output_dim=EMBEDDING_DIM,
                  embeddings_initializer=Constant(glove_embedding_matrix),
                  input_length=MAX_INPUT_SIZE,
                  trainable=False)

# Build the encoder selected by ENCODER_MODEL ('LSTM', 'CNN' or 'ATTN').
encoder = None
if ENCODER_MODEL == 'LSTM':
  from Encoder3 import LSTMEncoder
  encoder = LSTMEncoder(batch_size=BATCH_SIZE,
                        drop_out=DROP_OUT,
                        r_drop_out=R_DROP_OUT,
                        embedding_dim=EMBEDDING_DIM,
                        max_input_size=MAX_INPUT_SIZE)
elif ENCODER_MODEL == 'CNN':
  from Encoder import CNNEncoder
  encoder = CNNEncoder(batch_size=BATCH_SIZE,
                       drop_out=DROP_OUT,
                       embedding_dim=EMBEDDING_DIM,
                       max_input_size=MAX_INPUT_SIZE,
                       filter_size=CNN_FILTERS,
                       kernel_size=KERNEL_SIZE)
elif ENCODER_MODEL == 'ATTN':
  from Encoder import ATTNEncoder
  encoder = ATTNEncoder(batch_size=BATCH_SIZE,
                        drop_out=DROP_OUT,
                        max_input_size=MAX_INPUT_SIZE,
                        embedding_dim=EMBEDDING_DIM)
else:
  # Fail fast: the exception has to be raised, not merely constructed,
  # otherwise an unknown model name leaves `encoder` as None and the error
  # only surfaces later as an unrelated failure.
  raise ValueError('Invalid Encoder Model given')

In [0]:
from Decoder import LSTMDecoder

# LSTM decoder; its output vocabulary size is the German vocabulary size
# (de_max_words).
decoder = LSTMDecoder(
    batch_size=BATCH_SIZE,
    drop_out=DROP_OUT,
    r_drop_out=R_DROP_OUT,
    max_input_size=MAX_INPUT_SIZE,
    embedding_dim=EMBEDDING_DIM,
    vocab_size=de_max_words,
)

In [0]:
# Adam optimizer. `learning_rate` is the documented argument name; `lr` is only
# a deprecated alias that newer Keras releases reject.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
def loss_function(real, pred):
  """Padding-masked sparse softmax cross-entropy.

  Args:
    real: integer target token ids; id 0 is treated as padding.
    pred: unnormalised logits, with one trailing class axis on top of the
      shape of `real`.

  Returns:
    Scalar tensor: the mean over all positions of the per-position loss, where
    positions whose target id is 0 contribute 0. The mean is taken over every
    position (padded ones included), not over the unmasked count.
  """
  loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)
  # Build the mask with TensorFlow ops and cast it explicitly to the loss
  # dtype: a NumPy comparison on tensors only works in eager mode and relies
  # on an implicit integer -> float conversion in the multiplication.
  mask = tf.cast(tf.not_equal(real, 0), loss_.dtype)
  return tf.reduce_mean(loss_ * mask)

In [0]:
# Shuffle buffer spans the whole training set.
BUFFER_SIZE = len(tok_train_en)
# Number of full batches per epoch.
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

# (English, German) token-id pairs, shuffled and then batched.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((tok_train_en, tok_train_de))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)


In [0]:
checkpoint_prefix = os.path.join(CHECKPOINT_PATH, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
# NOTE(review): this overrides the TRAIN_FROM_SCRATCH value set in the
# configuration cell, so the restore branch below always runs.
TRAIN_FROM_SCRATCH = False
if not TRAIN_FROM_SCRATCH:
  # Run one forward pass through encoder and decoder before restoring
  # (required for TF to recognize input/output; presumably this is what
  # creates the layer variables the checkpoint values are loaded into).
  c_t = decoder.initialize_hidden_state()
  h_t = decoder.initialize_hidden_state()
  example_input_batch, example_target_batch = next(iter(train_dataset))
  H, _, _, _, _ = encoder(embed(example_input_batch))
  # One 'bos' token id per batch element as the first decoder input.
  z_t = K.cast(tf.expand_dims([de_dict_w2i['bos']] * BATCH_SIZE, 1), dtype='float32')
  context = decoder.initialize_hidden_state()
  predictions, h_t, c_t, _, _, context = decoder(z_t, h_t, c_t, c_t, c_t, H, context)
  # Load weights. tf.train.latest_checkpoint returns None when the directory
  # holds no checkpoint; restoring None does nothing, which would leave the
  # randomly initialised weights in place while still reporting 'loaded'.
  latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_PATH)
  if latest_checkpoint is None:
    raise FileNotFoundError('No checkpoint found in ' + CHECKPOINT_PATH)
  checkpoint.restore(latest_checkpoint)
  print('loaded')


In [0]:
def read_sentiment_data():
  """Return the whole content of `train_dataset.txt` in `elmoDir` as one string.

  The file is read as UTF-8 inside a `with` block so the handle is closed even
  if reading fails.
  """
  train_sent_path = os.path.join(elmoDir, 'train_dataset.txt')
  with open(train_sent_path, 'r', encoding='utf-8') as f_train_sent:
    return f_train_sent.read()

def read_sentiment_labels():
  """Return the labels in `train_dataset_labels.txt` (in `elmoDir`) as floats.

  The file holds one label per line; each line is converted with `float`, so
  a blank or non-numeric line raises ValueError. The file is read as UTF-8
  inside a `with` block so the handle is closed even if a conversion fails.
  """
  train_sent_path = os.path.join(elmoDir, 'train_dataset_labels.txt')
  with open(train_sent_path, 'r', encoding='utf-8') as f_train_sent:
    return [float(x) for x in f_train_sent]

def clean(data=None):
    """Normalise raw text and split it into lines.

    Steps, in order: every digit run (optionally followed by 'p' characters,
    e.g. "1080p") becomes 'n'; every run of two or more spaces collapses to a
    single space; apostrophes are removed; the text is split on newlines.

    Args:
        data: the raw text, or None.

    Returns:
        List of cleaned lines; an empty list when `data` is None.
    """
    if data is None:
        return []
    data = re.sub('[0-9]+p*', 'n', data)  # replace all numbers with n
    # ' {2,}' collapses a whole run at once; replacing only the pair '  '
    # would leave double spaces behind in runs of three or more.
    data = re.sub(' {2,}', ' ', data)  # remove repeated spaces
    data = re.sub("'", '', data)  # remove apostrophe
    return data.split('\n')

# Max length in the SST training set was 58
SST_MAX_LEN = 58
# Dimensionality of the GloVe vectors used for the sentiment data.
SST_GLOVE_DIM = 300

# Read the SST data
train_sent_raw = read_sentiment_data()
sent_labels = read_sentiment_labels()
# NOTE(review): clean() splits on '\n', so a trailing newline in the data file
# yields one extra empty sentence; confirm len(train_sent) matches
# len(sent_labels) before pairing them.
train_sent = clean(train_sent_raw)

# Fit once to learn the vocabulary, then cap num_words at the number of
# distinct words. This gives the same token ids as building a second
# Tokenizer with that num_words and fitting it again, because num_words is
# only consulted when texts are converted to sequences, not during fitting.
tokenizer = Tokenizer(num_words=None, lower=True, oov_token='<UNK>')
tokenizer.fit_on_texts(train_sent)
tokenizer.num_words = len(tokenizer.word_counts)
train_sent_tok = tokenizer.texts_to_sequences(train_sent)
vocab = {k: v for k, v in tokenizer.word_counts.items() if v >= 1}
vocab_len = len(vocab)

# Pad / truncate every sentence at the end to SST_MAX_LEN ids (0 = padding).
train_sent_tok = pad_sequences(train_sent_tok, maxlen=SST_MAX_LEN,
                               truncating='post', padding='post', value=0)

# index -> word lookup for the sentiment vocabulary
i2w = {v: k for k, v in tokenizer.word_index.items()}
# Create Glove Embedding dictionary
glove_embedding_matrix_n = create_embedding_indexmatrix(vocab_len,
                                                        embedding_dim=SST_GLOVE_DIM,
                                                        dict_en=i2w)

# Frozen embedding layer over the sentiment vocabulary.
embed_glove = Embedding(input_dim=vocab_len, output_dim=SST_GLOVE_DIM,
                        embeddings_initializer=Constant(glove_embedding_matrix_n),
                        input_length=SST_MAX_LEN,
                        trainable=False)


In [0]:

## Create ENCODINGS for Sentiment data set
import os

# Every batch handed to the encoder is filled up to this many rows.
# NOTE(review): presumably this has to equal the BATCH_SIZE the encoder was
# built with -- confirm.
ENCODE_BATCH_SIZE = 128

embeddings = []
# NOTE(review): this rebinds `train_dataset`, which until here held the
# translation training data; re-running earlier cells afterwards will see the
# sentiment dataset instead.
train_dataset = tf.data.Dataset.from_tensor_slices(train_sent_tok)
train_dataset = train_dataset.batch(ENCODE_BATCH_SIZE)
# The encodings are pickled below ENCODING_PATH afterwards; make sure the
# directory exists before spending time on the encoding pass.
os.makedirs(ENCODING_PATH, exist_ok=True)

# First batch: its rows are reused as filler for a short final batch.
example_input_batch = next(iter(train_dataset))
# NOTE(review): the outputs of this call are overwritten in the loop; it is
# kept in case the encoder relies on being called once beforehand -- confirm.
H, h_t, c_t, _, _ = encoder(embed_glove(example_input_batch))

for batch in train_dataset:
  glove_embeddings = embed_glove(batch)
  num_in_embdded = glove_embeddings.shape[0]  # real (non-filler) rows
  num_missing = ENCODE_BATCH_SIZE - num_in_embdded
  if num_missing > 0:
    # Short (final) batch: fill up with rows from the first batch.
    padding = embed_glove(example_input_batch[:num_missing])
    glove_embeddings = np.append(glove_embeddings, padding, axis=0)
  H, h_t, c_t, _, _ = encoder(glove_embeddings)
  # Per sentence, concatenate the GloVe vectors with the encoder outputs along
  # the feature axis. Only the first num_in_embdded rows are real data; the
  # slice excludes every filler row (a `<=` comparison against the row index
  # would let one filler row through).
  combined = np.concatenate((np.asarray(glove_embeddings)[:num_in_embdded],
                             np.asarray(H)[:num_in_embdded]), axis=-1)
  embeddings.extend(combined)

In [0]:
import os
import pickle

# Persist the list of per-sentence encodings. The target directory is created
# first so the write cannot fail on a missing path; the `with` block closes
# the file, so no explicit close() is needed.
os.makedirs(ENCODING_PATH, exist_ok=True)
with open(ENCODING_PATH + BCN_DATASET + 'encodings', 'wb') as fp:
    pickle.dump(embeddings, fp)

In [0]:
# Sanity check: load the pickled encodings back and show the first entry.
# The file is the one written by this notebook; do not point this at
# untrusted pickle files.
encodings_file = ENCODING_PATH + BCN_DATASET + 'encodings'
with open(encodings_file, 'rb') as handle:
    itemlist = pickle.load(handle)
print(itemlist[0])